cpcloud commented on a change in pull request #11707:
URL: https://github.com/apache/arrow/pull/11707#discussion_r754161686



##########
File path: cpp/cmake_modules/DefineOptions.cmake
##########
@@ -465,6 +467,12 @@ advised that if this is enabled 'install' will fail silently on components;\
 that have not been built"
                 OFF)
 
+  set(ARROW_SUBSTRAIT_REPO_AND_TAG_DEFAULT
+      "https://github.com/substrait-io/substrait 9e84da55393a24953ed9f9869fa423f86f4860f7")
+  define_option_string(ARROW_SUBSTRAIT_REPO_AND_TAG

Review comment:
       Not a big deal, but I imagine the repo changing approximately never, and
the hash changing a lot, at least initially. Is there value in splitting this
into two separate options, or even just a single `ARROW_SUBSTRAIT_GIT_REF`
option?

##########
File path: dev/archery/archery/lang/cpp.py
##########
@@ -42,7 +42,7 @@ def __init__(self,
                  cc=None, cxx=None, cxx_flags=None,
                  build_type=None, warn_level=None,
                  cpp_package_prefix=None, install_prefix=None, use_conda=None,
-                 build_static=False, build_shared=True, build_unity=True,
+                 build_static=True, build_shared=True, build_unity=True,

Review comment:
       What's the motivation for, and effect of, setting this to `True`? Does
it mean static libs are now always built in addition to shared ones?

##########
File path: cpp/src/arrow/engine/CMakeLists.txt
##########
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_custom_target(arrow_engine)
+
+arrow_install_all_headers("arrow/engine")
+
+set(ARROW_ENGINE_LINK_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF})
+
+#if(WIN32)
+#  list(APPEND ARROW_ENGINE_LINK_LIBS ws2_32.lib)
+#endif()
+
+set(ARROW_ENGINE_SRCS
+    substrait/expression_internal.cc
+    substrait/extension_types.cc
+    substrait/serde.cc
+    substrait/type_internal.cc)
+
+set(SUBSTRAIT_DIR "${CMAKE_CURRENT_BINARY_DIR}/substrait")
+set(SUBSTRAIT_GEN_DIR "${ARROW_SOURCE_DIR}/src/generated/substrait")
+set(SUBSTRAIT_PROTOS expression
+                     extensions
+                     function
+                     parameterized_types
+                     plan
+                     relations
+                     selection
+                     type
+                     type_expressions)
+
+string(FIND "${ARROW_SUBSTRAIT_REPO_AND_TAG}" " " TAG_START)
+if(TAG_START EQUAL -1)
+  message(FATAL_ERROR "Cannot parse ARROW_SUBSTRAIT_REPO_AND_TAG='${ARROW_SUBSTRAIT_REPO_AND_TAG}'")
+endif()
+string(SUBSTRING "${ARROW_SUBSTRAIT_REPO_AND_TAG}" 0 ${TAG_START} ARROW_SUBSTRAIT_REPO)
+string(SUBSTRING "${ARROW_SUBSTRAIT_REPO_AND_TAG}" ${TAG_START} -1 ARROW_SUBSTRAIT_TAG)
+string(STRIP "${ARROW_SUBSTRAIT_TAG}" ARROW_SUBSTRAIT_TAG)

Review comment:
       I think most of this string munging goes away if you go with a single
`ARROW_SUBSTRAIT_GIT_REF` option and hard-code the repo.

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from ",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...

Review comment:
       Good point; this needs to be specified.

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:

Review comment:
       Are these `TODO`s, or is there no `Datum` support for the corresponding 
Arrow types?

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...
+      // Furthermore, there's no way to determine scale or precision
+      break;
+
+    case st::Expression::Literal::kStruct: {
+      const auto& struct_ = lit.struct_();
+
+      ScalarVector fields(struct_.fields_size());
+      std::vector<std::string> field_names(fields.size(), "");
+      for (size_t i = 0; i < fields.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto field, FromProto(struct_.fields(i)));
+        DCHECK(field.is_scalar());
+        fields.push_back(field.scalar());
+      }
+      ARROW_ASSIGN_OR_RAISE(
+          auto scalar, StructScalar::Make(std::move(fields), 
std::move(field_names)));
+      return Datum(std::move(scalar));
+    }
+
+      // case st::Expression::Literal::kNamedStruct:
+
+    case st::Expression::Literal::kList: {
+      const auto& list = lit.list();
+
+      // FIXME
+      // No way to determine list value type for empty list literals
+      DCHECK_NE(list.values_size(), 0);
+
+      ScalarVector values(list.values_size());
+      for (size_t i = 0; i < values.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto value, FromProto(list.values(i)));
+        DCHECK(value.is_scalar());
+        values.push_back(value.scalar());
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(values[0]->type));
+      RETURN_NOT_OK(builder->AppendScalars(values));
+      ARROW_ASSIGN_OR_RAISE(auto arr, builder->Finish());
+      return Datum(std::make_shared<ListScalar>(std::move(arr)));
+    }
+
+    case st::Expression::Literal::kMap: {
+      const auto& map = lit.map();
+
+      // FIXME
+      // No way to determine list value type for empty list literals
+      DCHECK_NE(map.key_values_size(), 0);
+
+      ScalarVector keys(map.key_values_size()), values(map.key_values_size());
+      for (size_t i = 0; i < values.size(); ++i) {
+        const auto& kv = map.key_values(i);
+
+        static const std::array<char const*, 4> kMissing = {"key and value", 
"value",
+                                                            "key", nullptr};
+        if (auto missing = kMissing[kv.has_key() + kv.has_value() * 2]) {
+          return Status::Invalid("While converting to MapScalar encountered missing ",
+                                 missing, " in ", map.DebugString());
+        }
+        ARROW_ASSIGN_OR_RAISE(auto key, FromProto(kv.key()));
+        ARROW_ASSIGN_OR_RAISE(auto value, FromProto(kv.value()));
+
+        DCHECK(key.is_scalar());
+        DCHECK(value.is_scalar());
+
+        keys.push_back(key.scalar());
+        values.push_back(value.scalar());
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto key_builder, MakeBuilder(keys[0]->type));
+      ARROW_ASSIGN_OR_RAISE(auto value_builder, MakeBuilder(keys[0]->type));
+      RETURN_NOT_OK(key_builder->AppendScalars(keys));
+      RETURN_NOT_OK(value_builder->AppendScalars(values));
+      ARROW_ASSIGN_OR_RAISE(auto key_arr, key_builder->Finish());
+      ARROW_ASSIGN_OR_RAISE(auto value_arr, value_builder->Finish());
+      ARROW_ASSIGN_OR_RAISE(
+          auto kv_arr,
+          StructArray::Make(ArrayVector{std::move(key_arr), 
std::move(value_arr)},
+                            std::vector<std::string>{"key", "value"}));
+      return Datum(std::make_shared<MapScalar>(std::move(kv_arr)));
+    }
+
+    case st::Expression::Literal::kNull: {
+      ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(lit.null()));
+      if (!type_nullable.second) {
+        return Status::Invalid("Null literal ", lit.DebugString(),
+                               " is of non-nullable type");

Review comment:
       What a pathological case :grimacing: 

##########
File path: cpp/src/arrow/engine/substrait/extension_types.cc
##########
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/extension_types.h"
+
+#include "arrow/engine/simple_extension_type_internal.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+constexpr util::string_view kUuidExtensionName = "uuid";
+struct UuidExtensionParams {};
+std::shared_ptr<DataType> UuidGetStorage(const UuidExtensionParams&) {
+  return fixed_size_binary(16);
+}
+static auto kUuidExtensionParamsProperties = internal::MakeProperties();
+
+using UuidType = SimpleExtensionType<kUuidExtensionName, UuidExtensionParams,
+                                     decltype(kUuidExtensionParamsProperties),
+                                     kUuidExtensionParamsProperties, 
UuidGetStorage>;
+
+constexpr util::string_view kFixedCharExtensionName = "fixed_char";
+struct FixedCharExtensionParams {
+  int32_t length;
+};
+std::shared_ptr<DataType> FixedCharGetStorage(const FixedCharExtensionParams& 
params) {
+  return fixed_size_binary(params.length);
+}
+static auto kFixedCharExtensionParamsProperties = internal::MakeProperties(
+    internal::DataMember("length", &FixedCharExtensionParams::length));
+
+using FixedCharType =
+    SimpleExtensionType<kFixedCharExtensionName, FixedCharExtensionParams,
+                        decltype(kFixedCharExtensionParamsProperties),
+                        kFixedCharExtensionParamsProperties, 
FixedCharGetStorage>;
+
+constexpr util::string_view kVarCharExtensionName = "varchar";
+struct VarCharExtensionParams {
+  int32_t length;
+};
+std::shared_ptr<DataType> VarCharGetStorage(const VarCharExtensionParams&) {
+  return utf8();
+}
+static auto kVarCharExtensionParamsProperties = internal::MakeProperties(
+    internal::DataMember("length", &VarCharExtensionParams::length));
+
+using VarCharType =
+    SimpleExtensionType<kVarCharExtensionName, VarCharExtensionParams,
+                        decltype(kVarCharExtensionParamsProperties),
+                        kVarCharExtensionParamsProperties, VarCharGetStorage>;
+
+}  // namespace
+
+std::shared_ptr<DataType> uuid() { return UuidType::Make({}); }
+
+std::shared_ptr<DataType> fixed_char(int32_t length) {
+  return FixedCharType::Make({length});
+}
+
+std::shared_ptr<DataType> varchar(int32_t length) { return 
VarCharType::Make({length}); }
+
+bool UnwrapUuid(const DataType& t) {
+  if (auto params = UuidType::GetIf(t)) {
+    return true;
+  }
+  return false;
+}
+
+util::optional<int32_t> UnwrapFixedChar(const DataType& t) {
+  if (auto params = FixedCharType::GetIf(t)) {
+    return params->length;
+  }
+  return util::nullopt;
+}
+
+util::optional<int32_t> UnwrapVarChar(const DataType& t) {

Review comment:
       Does it make sense to unify the return type here by returning `params`?

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";
+    fields[i] =
+        field(std::move(name), std::move(type_nullable.first), 
type_nullable.second);
+  }
+  return fields;
+}
+
+}  // namespace
+
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProto(const st::Type& 
type) {
+  switch (type.kind_case()) {
+    case st::Type::kBool:
+      return FromProtoImpl<BooleanType>(type.bool_());
+
+    case st::Type::kI8:
+      return FromProtoImpl<Int8Type>(type.i8());
+    case st::Type::kI16:
+      return FromProtoImpl<Int16Type>(type.i16());
+    case st::Type::kI32:
+      return FromProtoImpl<Int32Type>(type.i32());
+    case st::Type::kI64:
+      return FromProtoImpl<Int64Type>(type.i64());
+
+    case st::Type::kFp32:
+      return FromProtoImpl<FloatType>(type.fp32());
+    case st::Type::kFp64:
+      return FromProtoImpl<DoubleType>(type.fp64());
+
+    case st::Type::kString:
+      return FromProtoImpl<StringType>(type.string());
+    case st::Type::kBinary:
+      return FromProtoImpl<BinaryType>(type.binary());
+
+    case st::Type::kTimestamp:
+      return FromProtoImpl<TimestampType>(type.timestamp(), TimeUnit::MICRO);
+    case st::Type::kTimestampTz:
+      return FromProtoImpl<TimestampType>(type.timestamp_tz(), TimeUnit::MICRO,
+                                          TimestampTzTimezoneString());
+    case st::Type::kDate:
+      // FIXME
+      // Substrait uses uint32_t to store dates, and further restricts the 
allowed
+      // range of dates to [1000-01-01..9999-12-31]. Does this mean the value 
should
+      // be interpreted as an offset from 1000-01-01 instead of the epoch? Or 
should
+      // the value be signed instead?
+      // Furthermore, simple_logical_types.md states that the equivalent arrow 
type
+      // is Date64 (which measures milliseconds rather than days). Is that 
incorrect?
+      return FromProtoImpl<Date32Type>(type.date());
+
+    case st::Type::kTime:
+      return FromProtoImpl<Time64Type>(type.time(), TimeUnit::MICRO);
+
+    case st::Type::kIntervalYear:
+      // FIXME
+      // None of MonthIntervalType, DayTimeIntervalType, 
MonthDayNanoIntervalType
+      // corresponds; none has a year field. Lossy conversion to 
MonthIntervalType
+      // would be possible...
+      break;
+
+    case st::Type::kIntervalDay:
+      // FIXME
+      // Documentation is inconsistent; the precision of the sub-day interval 
is
+      // described as microsecond in simple_logical_types.md but 
IntervalDayToSecond has
+      // the field `int32 seconds`. At microsecond precision it's minimally 
necessary to
+      // store all values in the range `[0,24*60*60*1000_000)` in order to 
express all
+      // possible sub-day intervals, but this is not possible for 32 bit 
integers.

Review comment:
       Yeah, this is underspecified right now.

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";

Review comment:
       Does this function need a check to ensure that `size` and `Names` match? 
Can we avoid the possible nullptr dereference by using a `std::vector` here?

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...

Review comment:
       https://github.com/substrait-io/substrait/issues/84

##########
File path: cpp/src/arrow/array/builder_base.h
##########
@@ -286,13 +287,27 @@ ARROW_EXPORT
 Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                    std::unique_ptr<ArrayBuilder>* out);
 
+inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(

Review comment:
       Guessing these were needed during the development of this PR?

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(

Review comment:
       `ExtensionScalar` looks friggin' neat.

##########
File path: cpp/src/arrow/engine/substrait/serde.cc
##########
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/serde.h"
+
+#include "arrow/engine/substrait/expression_internal.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/util/string_view.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
+#include "google/protobuf/message.h"
+
+#include "generated/substrait/plan.pb.h"
+
+namespace google {
+namespace protobuf {
+
+class Message;
+
+}  // namespace protobuf
+}  // namespace google
+
+namespace arrow {
+namespace engine {
+
+Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name,
+                           google::protobuf::Message* message) {
+  google::protobuf::io::ArrayInputStream buf_stream{buf.data(),
+                                                    
static_cast<int>(buf.size())};
+
+  if (message->ParseFromZeroCopyStream(&buf_stream)) {
+    return Status::OK();
+  }
+  return Status::IOError("ParseFromZeroCopyStream failed for ", full_name);
+}
+
+template <typename Message>
+Result<Message> ParseFromBuffer(const Buffer& buf) {
+  Message message;
+  ARROW_RETURN_NOT_OK(
+      ParseFromBufferImpl(buf, Message::descriptor()->full_name(), &message));
+  return message;
+}
+
+Result<compute::Declaration> Convert(const st::Rel& relation) {
+  return Status::NotImplemented("");
+}
+
+Result<std::vector<compute::Declaration>> ConvertPlan(const Buffer& buf) {
+  ARROW_ASSIGN_OR_RAISE(auto plan, ParseFromBuffer<st::Plan>(buf));
+
+  std::vector<compute::Declaration> decls;
+  for (const auto& relation : plan.relations()) {
+    ARROW_ASSIGN_OR_RAISE(auto decl, Convert(relation));
+    decls.push_back(std::move(decl));
+  }
+
+  return decls;
+}
+
+Result<std::shared_ptr<Schema>> DeserializeSchema(const Buffer& buf) {
+  ARROW_ASSIGN_OR_RAISE(auto named_struct, 
ParseFromBuffer<st::Type::NamedStruct>(buf));
+  return FromProto(named_struct);

Review comment:
       Not sure if you've implemented it yet, but the transformation here 
requires a bit more than a simple traversal of the type tree due to the way 
Substrait represents field names. Field names are in depth-first order, so one 
possibility is to use a queue that gets passed around and popped whenever field 
names are needed to construct a type.

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal

Review comment:
       I don't think this matters, since the length is an upper bound. There's 
no real benefit to returning something other than plain old string here for 
now, because I don't think Arrow is currently capable of representing a 
limited-variable-length string, and I personally do not think that is 
important to address any time soon. `string` can represent this just fine.

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...
+      // Furthermore, there's no way to determine scale or precision
+      break;
+
+    case st::Expression::Literal::kStruct: {
+      const auto& struct_ = lit.struct_();
+
+      ScalarVector fields(struct_.fields_size());
+      std::vector<std::string> field_names(fields.size(), "");
+      for (size_t i = 0; i < fields.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto field, FromProto(struct_.fields(i)));
+        DCHECK(field.is_scalar());
+        fields.push_back(field.scalar());
+      }
+      ARROW_ASSIGN_OR_RAISE(
+          auto scalar, StructScalar::Make(std::move(fields), 
std::move(field_names)));
+      return Datum(std::move(scalar));
+    }
+
+      // case st::Expression::Literal::kNamedStruct:
+
+    case st::Expression::Literal::kList: {
+      const auto& list = lit.list();
+
+      // FIXME
+      // No way to determine list value type for empty list literals
+      DCHECK_NE(list.values_size(), 0);
+
+      ScalarVector values(list.values_size());
+      for (size_t i = 0; i < values.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto value, FromProto(list.values(i)));
+        DCHECK(value.is_scalar());
+        values.push_back(value.scalar());
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(values[0]->type));
+      RETURN_NOT_OK(builder->AppendScalars(values));
+      ARROW_ASSIGN_OR_RAISE(auto arr, builder->Finish());
+      return Datum(std::make_shared<ListScalar>(std::move(arr)));
+    }
+
+    case st::Expression::Literal::kMap: {
+      const auto& map = lit.map();
+
+      // FIXME
+      // No way to determine list value type for empty list literals
+      DCHECK_NE(map.key_values_size(), 0);

Review comment:
       Similarly, I'm using the `null` type for the keys/values here as well.

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...
+      // Furthermore, there's no way to determine scale or precision

Review comment:
       This is fixed in `main`

##########
File path: cpp/src/arrow/engine/substrait/serde.cc
##########
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/serde.h"
+
+#include "arrow/engine/substrait/expression_internal.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/util/string_view.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
+#include "google/protobuf/message.h"
+
+#include "generated/substrait/plan.pb.h"
+
+namespace google {
+namespace protobuf {
+
+class Message;
+
+}  // namespace protobuf
+}  // namespace google
+
+namespace arrow {
+namespace engine {
+
+Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name,
+                           google::protobuf::Message* message) {
+  google::protobuf::io::ArrayInputStream buf_stream{buf.data(),
+                                                    
static_cast<int>(buf.size())};
+
+  if (message->ParseFromZeroCopyStream(&buf_stream)) {
+    return Status::OK();
+  }
+  return Status::IOError("ParseFromZeroCopyStream failed for ", full_name);
+}
+
+template <typename Message>
+Result<Message> ParseFromBuffer(const Buffer& buf) {
+  Message message;
+  ARROW_RETURN_NOT_OK(
+      ParseFromBufferImpl(buf, Message::descriptor()->full_name(), &message));
+  return message;
+}
+
+Result<compute::Declaration> Convert(const st::Rel& relation) {
+  return Status::NotImplemented("");

Review comment:
       Ha, I'm keen to review this implementation; so far my experience is that 
it's generally more challenging :)

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";
+    fields[i] =
+        field(std::move(name), std::move(type_nullable.first), 
type_nullable.second);
+  }
+  return fields;
+}
+
+}  // namespace
+
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProto(const st::Type& 
type) {
+  switch (type.kind_case()) {
+    case st::Type::kBool:
+      return FromProtoImpl<BooleanType>(type.bool_());
+
+    case st::Type::kI8:
+      return FromProtoImpl<Int8Type>(type.i8());
+    case st::Type::kI16:
+      return FromProtoImpl<Int16Type>(type.i16());
+    case st::Type::kI32:
+      return FromProtoImpl<Int32Type>(type.i32());
+    case st::Type::kI64:
+      return FromProtoImpl<Int64Type>(type.i64());
+
+    case st::Type::kFp32:
+      return FromProtoImpl<FloatType>(type.fp32());
+    case st::Type::kFp64:
+      return FromProtoImpl<DoubleType>(type.fp64());
+
+    case st::Type::kString:
+      return FromProtoImpl<StringType>(type.string());
+    case st::Type::kBinary:
+      return FromProtoImpl<BinaryType>(type.binary());
+
+    case st::Type::kTimestamp:
+      return FromProtoImpl<TimestampType>(type.timestamp(), TimeUnit::MICRO);
+    case st::Type::kTimestampTz:
+      return FromProtoImpl<TimestampType>(type.timestamp_tz(), TimeUnit::MICRO,
+                                          TimestampTzTimezoneString());
+    case st::Type::kDate:
+      // FIXME
+      // Substrait uses uint32_t to store dates, and further restricts the 
allowed
+      // range of dates to [1000-01-01..9999-12-31]. Does this mean the value 
should
+      // be interpreted as an offset from 1000-01-01 instead of the epoch? Or 
should
+      // the value be signed instead?
+      // Furthermore, simple_logical_types.md states that the equivalent arrow 
type
+      // is Date64 (which measures milliseconds rather than days). Is that 
incorrect?
+      return FromProtoImpl<Date32Type>(type.date());
+
+    case st::Type::kTime:
+      return FromProtoImpl<Time64Type>(type.time(), TimeUnit::MICRO);
+
+    case st::Type::kIntervalYear:
+      // FIXME
+      // None of MonthIntervalType, DayTimeIntervalType, 
MonthDayNanoIntervalType
+      // corresponds; none has a year field. Lossy conversion to 
MonthIntervalType
+      // would be possible...
+      break;
+
+    case st::Type::kIntervalDay:
+      // FIXME
+      // Documentation is inconsistent; the precision of the sub-day interval 
is
+      // described as microsecond in simple_logical_types.md but 
IntervalDayToSecond has
+      // the field `int32 seconds`. At microsecond precision it's minimally 
necessary to
+      // store all values in the range `[0,24*60*60*1000_000)` in order to 
express all
+      // possible sub-day intervals, but this is not possible for 32 bit 
integers.
+      //
+      // Possible fixes: amend that field to `int64 milliseconds`, then this 
type can be
+      //                 converted to MonthDayNanoIntervalType (months will 
always be
+      //                 0).
+      //               : amend documentation to claim only second precision, 
then this
+      //                 type can be converted to DayTimeIntervalType 
(milliseconds %
+      //                 1000 will always be 0).
+      break;
+
+    case st::Type::kUuid:
+      return FromProtoImpl(type.uuid(), uuid);
+
+    case st::Type::kFixedChar:
+      // need extension type to mark utf-8 constraint
+      return FromProtoImpl(type.fixed_char(), fixed_char, 
type.fixed_char().length());
+
+    case st::Type::kVarchar:
+      // need extension type to hold type.varchar().length() constraint

Review comment:
       Is this a `TODO`, or an explanation of something elsewhere?

##########
File path: cpp/src/arrow/engine/substrait/serde_test.cc
##########
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/serde.h"
+
+#include <gtest/gtest.h>
+
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
+
+namespace arrow {
+namespace engine {
+
+TEST(Substrait, BasicTypeRoundTrip) {
+  for (auto type : {
+           boolean(),
+
+           int8(),
+           int16(),
+           int32(),
+           int64(),
+
+           float32(),
+           float64(),
+
+           date32(),
+           timestamp(TimeUnit::MICRO),
+           timestamp(TimeUnit::MICRO, "UTC"),
+           time64(TimeUnit::MICRO),
+
+           decimal128(27, 5),
+
+           struct_({
+               field("", int64()),
+               field("", list(utf8())),
+           }),
+
+           uuid(),
+           fixed_char(32),
+           varchar(1024),

Review comment:
       This formatting is really bizarre, but I trust `clang-format` ... I 
guess.

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";
+    fields[i] =
+        field(std::move(name), std::move(type_nullable.first), 
type_nullable.second);
+  }
+  return fields;
+}
+
+}  // namespace
+
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProto(const st::Type& 
type) {
+  switch (type.kind_case()) {
+    case st::Type::kBool:
+      return FromProtoImpl<BooleanType>(type.bool_());
+
+    case st::Type::kI8:
+      return FromProtoImpl<Int8Type>(type.i8());
+    case st::Type::kI16:
+      return FromProtoImpl<Int16Type>(type.i16());
+    case st::Type::kI32:
+      return FromProtoImpl<Int32Type>(type.i32());
+    case st::Type::kI64:
+      return FromProtoImpl<Int64Type>(type.i64());
+
+    case st::Type::kFp32:
+      return FromProtoImpl<FloatType>(type.fp32());
+    case st::Type::kFp64:
+      return FromProtoImpl<DoubleType>(type.fp64());
+
+    case st::Type::kString:
+      return FromProtoImpl<StringType>(type.string());
+    case st::Type::kBinary:
+      return FromProtoImpl<BinaryType>(type.binary());
+
+    case st::Type::kTimestamp:
+      return FromProtoImpl<TimestampType>(type.timestamp(), TimeUnit::MICRO);
+    case st::Type::kTimestampTz:
+      return FromProtoImpl<TimestampType>(type.timestamp_tz(), TimeUnit::MICRO,
+                                          TimestampTzTimezoneString());
+    case st::Type::kDate:
+      // FIXME
+      // Substrait uses uint32_t to store dates, and further restricts the 
allowed
+      // range of dates to [1000-01-01..9999-12-31]. Does this mean the value 
should
+      // be interpreted as an offset from 1000-01-01 instead of the epoch? Or 
should
+      // the value be signed instead?
+      // Furthermore, simple_logical_types.md states that the equivalent arrow 
type
+      // is Date64 (which measures milliseconds rather than days). Is that 
incorrect?
+      return FromProtoImpl<Date32Type>(type.date());
+
+    case st::Type::kTime:
+      return FromProtoImpl<Time64Type>(type.time(), TimeUnit::MICRO);
+
+    case st::Type::kIntervalYear:
+      // FIXME
+      // None of MonthIntervalType, DayTimeIntervalType, 
MonthDayNanoIntervalType
+      // corresponds; none has a year field. Lossy conversion to 
MonthIntervalType
+      // would be possible...
+      break;
+
+    case st::Type::kIntervalDay:
+      // FIXME
+      // Documentation is inconsistent; the precision of the sub-day interval 
is
+      // described as microsecond in simple_logical_types.md but 
IntervalDayToSecond has
+      // the field `int32 seconds`. At microsecond precision it's minimally 
necessary to
+      // store all values in the range `[0,24*60*60*1000_000)` in order to 
express all
+      // possible sub-day intervals, but this is not possible for 32 bit 
integers.
+      //
+      // Possible fixes: amend that field to `int64 milliseconds`, then this 
type can be
+      //                 converted to MonthDayNanoIntervalType (months will 
always be
+      //                 0).
+      //               : amend documentation to claim only second precision, 
then this
+      //                 type can be converted to DayTimeIntervalType 
(milliseconds %
+      //                 1000 will always be 0).
+      break;
+
+    case st::Type::kUuid:
+      return FromProtoImpl(type.uuid(), uuid);
+
+    case st::Type::kFixedChar:
+      // need extension type to mark utf-8 constraint
+      return FromProtoImpl(type.fixed_char(), fixed_char, 
type.fixed_char().length());
+
+    case st::Type::kVarchar:
+      // need extension type to hold type.varchar().length() constraint
+      return FromProtoImpl(type.varchar(), varchar, type.varchar().length());
+
+    case st::Type::kFixedBinary:
+      return FromProtoImpl<FixedSizeBinaryType>(type.fixed_binary(),
+                                                type.fixed_binary().length());
+
+    case st::Type::kDecimal: {
+      const auto& decimal = type.decimal();
+      return FromProtoImpl<Decimal128Type>(decimal, decimal.precision(), 
decimal.scale());
+    }
+
+    case st::Type::kStruct: {
+      const auto& struct_ = type.struct_();
+
+      ARROW_ASSIGN_OR_RAISE(auto fields,
+                            FieldsFromProto(struct_.types_size(), 
struct_.types()));
+
+      return FromProtoImpl<StructType>(struct_, std::move(fields));
+    }
+
+      // NOTE: NamedStruct is not enumerated in KindCase.

Review comment:
       That is intentional.

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";
+    fields[i] =
+        field(std::move(name), std::move(type_nullable.first), 
type_nullable.second);
+  }
+  return fields;
+}
+
+}  // namespace
+
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProto(const st::Type& 
type) {
+  switch (type.kind_case()) {
+    case st::Type::kBool:
+      return FromProtoImpl<BooleanType>(type.bool_());
+
+    case st::Type::kI8:
+      return FromProtoImpl<Int8Type>(type.i8());
+    case st::Type::kI16:
+      return FromProtoImpl<Int16Type>(type.i16());
+    case st::Type::kI32:
+      return FromProtoImpl<Int32Type>(type.i32());
+    case st::Type::kI64:
+      return FromProtoImpl<Int64Type>(type.i64());
+
+    case st::Type::kFp32:
+      return FromProtoImpl<FloatType>(type.fp32());
+    case st::Type::kFp64:
+      return FromProtoImpl<DoubleType>(type.fp64());
+
+    case st::Type::kString:
+      return FromProtoImpl<StringType>(type.string());
+    case st::Type::kBinary:
+      return FromProtoImpl<BinaryType>(type.binary());
+
+    case st::Type::kTimestamp:
+      return FromProtoImpl<TimestampType>(type.timestamp(), TimeUnit::MICRO);
+    case st::Type::kTimestampTz:
+      return FromProtoImpl<TimestampType>(type.timestamp_tz(), TimeUnit::MICRO,
+                                          TimestampTzTimezoneString());
+    case st::Type::kDate:
+      // FIXME
+      // Substrait uses uint32_t to store dates, and further restricts the 
allowed
+      // range of dates to [1000-01-01..9999-12-31]. Does this mean the value 
should
+      // be interpreted as an offset from 1000-01-01 instead of the epoch? Or 
should
+      // the value be signed instead?
+      // Furthermore, simple_logical_types.md states that the equivalent arrow 
type
+      // is Date64 (which measures milliseconds rather than days). Is that 
incorrect?

Review comment:
       I'll say, though, that the specific unit is only specified for literals.

##########
File path: cpp/src/arrow/engine/CMakeLists.txt
##########
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_custom_target(arrow_engine)
+
+arrow_install_all_headers("arrow/engine")
+
+set(ARROW_ENGINE_LINK_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF})
+
+#if(WIN32)
+#  list(APPEND ARROW_ENGINE_LINK_LIBS ws2_32.lib)
+#endif()
+
+set(ARROW_ENGINE_SRCS
+    substrait/expression_internal.cc
+    substrait/extension_types.cc
+    substrait/serde.cc
+    substrait/type_internal.cc)
+
+set(SUBSTRAIT_DIR "${CMAKE_CURRENT_BINARY_DIR}/substrait")
+set(SUBSTRAIT_GEN_DIR "${ARROW_SOURCE_DIR}/src/generated/substrait")
+set(SUBSTRAIT_PROTOS expression
+                     extensions
+                     function
+                     parameterized_types
+                     plan
+                     relations
+                     selection
+                     type
+                     type_expressions)
+
+string(FIND "${ARROW_SUBSTRAIT_REPO_AND_TAG}" " " TAG_START)
+if(TAG_START EQUAL -1)
+  message(FATAL_ERROR "Cannot parse 
ARROW_SUBSTRAIT_REPO_AND_TAG='${ARROW_SUBSTRAIT_REPO_AND_TAG}'")
+endif()
+string(SUBSTRING "${ARROW_SUBSTRAIT_REPO_AND_TAG}" 0 ${TAG_START} 
ARROW_SUBSTRAIT_REPO)
+string(SUBSTRING "${ARROW_SUBSTRAIT_REPO_AND_TAG}" ${TAG_START} -1 
ARROW_SUBSTRAIT_TAG)
+string(STRIP "${ARROW_SUBSTRAIT_TAG}" ARROW_SUBSTRAIT_TAG)
+
+externalproject_add(substrait_ep
+                    GIT_REPOSITORY "${ARROW_SUBSTRAIT_REPO}"
+                    GIT_TAG "${ARROW_SUBSTRAIT_TAG}"
+                    SOURCE_DIR "${SUBSTRAIT_DIR}"
+                    CONFIGURE_COMMAND ""
+                    BUILD_COMMAND ""
+                    INSTALL_COMMAND "")
+
+set(SUBSTRAIT_PROTO_GEN_ALL)
+foreach(SUBSTRAIT_PROTO ${SUBSTRAIT_PROTOS})
+  set(SUBSTRAIT_PROTO_GEN "${SUBSTRAIT_GEN_DIR}/${SUBSTRAIT_PROTO}.pb.cc")
+
+  set_source_files_properties(${SUBSTRAIT_PROTO_GEN} PROPERTIES GENERATED TRUE)
+  add_custom_command(OUTPUT ${SUBSTRAIT_PROTO_GEN}
+                     COMMAND ${ARROW_PROTOBUF_PROTOC}
+                             "-I${SUBSTRAIT_DIR}/binary"
+                             "--cpp_out=${SUBSTRAIT_GEN_DIR}"
+                             "${SUBSTRAIT_PROTO}.proto"
+                     DEPENDS ${PROTO_DEPENDS} substrait_ep)
+
+  list(APPEND ARROW_ENGINE_SRCS ${SUBSTRAIT_PROTO_GEN})
+  list(APPEND SUBSTRAIT_PROTO_GEN_ALL ${SUBSTRAIT_PROTO_GEN})
+endforeach()
+
+add_custom_target(substrait_gen ALL DEPENDS ${SUBSTRAIT_PROTO_GEN_ALL})
+
+find_package(Git)
+add_custom_target(substrait_gen_verify
+                  COMMAND ${GIT_EXECUTABLE} diff --quiet ${SUBSTRAIT_GEN_DIR}

Review comment:
       should this have an `--exit-code` flag as well?

##########
File path: cpp/src/arrow/scalar.h
##########
@@ -344,8 +344,8 @@ struct ARROW_EXPORT TimestampScalar : public 
TemporalScalar<TimestampType> {
   using TemporalScalar<TimestampType>::TemporalScalar;
 
   TimestampScalar(typename TemporalScalar<TimestampType>::ValueType value,
-                  TimeUnit::type unit)
-      : TimestampScalar(std::move(value), timestamp(unit)) {}
+                  TimeUnit::type unit, std::string tz = "")
+      : TimestampScalar(std::move(value), timestamp(unit, std::move(tz))) {}

Review comment:
       Fun. This in theory allows a user to construct timestamp scalar values, 
each with a different time zone :grimacing:.

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from 
",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),

Review comment:
       Is fixed size binary the right type here? Does Arrow have a fixed-length 
string (part of me hopes it does not)?

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from ",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...
+      // Furthermore, there's no way to determine scale or precision
+      break;
+
+    case st::Expression::Literal::kStruct: {
+      const auto& struct_ = lit.struct_();
+
+      ScalarVector fields(struct_.fields_size());
+      std::vector<std::string> field_names(fields.size(), "");
+      for (size_t i = 0; i < fields.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto field, FromProto(struct_.fields(i)));
+        DCHECK(field.is_scalar());
+        fields.push_back(field.scalar());
+      }
+      ARROW_ASSIGN_OR_RAISE(
+          auto scalar, StructScalar::Make(std::move(fields), 
std::move(field_names)));
+      return Datum(std::move(scalar));
+    }
+
+      // case st::Expression::Literal::kNamedStruct:
+
+    case st::Expression::Literal::kList: {
+      const auto& list = lit.list();
+
+      // FIXME
+      // No way to determine list value type for empty list literals

Review comment:
       I've been using the ever-annoying `null` type for this, because as you 
say the type can't be known without values.

##########
File path: cpp/src/arrow/engine/simple_extension_type_internal.h
##########
@@ -0,0 +1,181 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "arrow/extension_type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/reflection_internal.h"
+#include "arrow/util/string.h"
+
+namespace arrow {
+namespace engine {
+
+template <const util::string_view& kExtensionName, typename Params,
+          typename ParamsProperties, const ParamsProperties& kProperties,
+          std::shared_ptr<DataType> GetStorage(const Params&)>
+class SimpleExtensionType : public ExtensionType {

Review comment:
       This is an excellent use of extension types IMO and a good example of 
two systems working together with their respective extension mechanisms.

##########
File path: cpp/src/arrow/engine/substrait/serde.cc
##########
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/serde.h"
+
+#include "arrow/engine/substrait/expression_internal.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/util/string_view.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
+#include "google/protobuf/message.h"
+
+#include "generated/substrait/plan.pb.h"
+
+namespace google {
+namespace protobuf {
+
+class Message;
+
+}  // namespace protobuf
+}  // namespace google
+
+namespace arrow {
+namespace engine {
+
+Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name,
+                           google::protobuf::Message* message) {
+  google::protobuf::io::ArrayInputStream buf_stream{buf.data(),
+                                                    
static_cast<int>(buf.size())};
+
+  if (message->ParseFromZeroCopyStream(&buf_stream)) {
+    return Status::OK();
+  }
+  return Status::IOError("ParseFromZeroCopyStream failed for ", full_name);
+}
+
+template <typename Message>
+Result<Message> ParseFromBuffer(const Buffer& buf) {
+  Message message;
+  ARROW_RETURN_NOT_OK(
+      ParseFromBufferImpl(buf, Message::descriptor()->full_name(), &message));
+  return message;
+}
+
+Result<compute::Declaration> Convert(const st::Rel& relation) {
+  return Status::NotImplemented("");
+}
+
+Result<std::vector<compute::Declaration>> ConvertPlan(const Buffer& buf) {
+  ARROW_ASSIGN_OR_RAISE(auto plan, ParseFromBuffer<st::Plan>(buf));
+
+  std::vector<compute::Declaration> decls;
+  for (const auto& relation : plan.relations()) {
+    ARROW_ASSIGN_OR_RAISE(auto decl, Convert(relation));
+    decls.push_back(std::move(decl));
+  }
+
+  return decls;

Review comment:
       Will this automatically be moved?

##########
File path: cpp/src/arrow/engine/substrait/serde_test.cc
##########
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/serde.h"
+
+#include <gtest/gtest.h>
+
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/matchers.h"
+
+namespace arrow {
+namespace engine {
+
+TEST(Substrait, BasicTypeRoundTrip) {
+  for (auto type : {
+           boolean(),
+
+           int8(),
+           int16(),
+           int32(),
+           int64(),
+
+           float32(),
+           float64(),
+
+           date32(),
+           timestamp(TimeUnit::MICRO),
+           timestamp(TimeUnit::MICRO, "UTC"),
+           time64(TimeUnit::MICRO),
+
+           decimal128(27, 5),
+
+           struct_({
+               field("", int64()),
+               field("", list(utf8())),

Review comment:
       It _could_ be useful to give the fields their string index name, like 
`"0"`, `"1"`, etc.

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";

Review comment:
       Moving an element of an array here seems like it would lead to tearing 
one's hair out trying to debug, but maybe this is sanctioned C++?

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from ",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...
+      // Furthermore, there's no way to determine scale or precision
+      break;
+
+    case st::Expression::Literal::kStruct: {
+      const auto& struct_ = lit.struct_();
+
+      ScalarVector fields(struct_.fields_size());
+      std::vector<std::string> field_names(fields.size(), "");
+      for (size_t i = 0; i < fields.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto field, FromProto(struct_.fields(i)));
+        DCHECK(field.is_scalar());
+        fields.push_back(field.scalar());
+      }
+      ARROW_ASSIGN_OR_RAISE(
+          auto scalar, StructScalar::Make(std::move(fields), 
std::move(field_names)));
+      return Datum(std::move(scalar));
+    }
+
+      // case st::Expression::Literal::kNamedStruct:

Review comment:
       This comment can be removed, because `NamedStruct` isn't part of the 
type union. It's only used in the `base_schema` field of `ReadRel`.

##########
File path: cpp/src/arrow/engine/substrait/expression_internal.cc
##########
@@ -0,0 +1,427 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/engine/substrait/expression_internal.h"
+
+#include <utility>
+
+#include "arrow/builder.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/type_internal.h"
+#include "arrow/engine/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "arrow/util/make_unique.h"
+#include "generated/substrait/expression.pb.h"  // IWYU pragma: export
+
+namespace st = io::substrait;
+
+namespace arrow {
+namespace engine {
+namespace {
+
+std::shared_ptr<FixedSizeBinaryScalar> FixedSizeBinaryScalarFromBytes(
+    const std::string& bytes) {
+  auto buf = Buffer::FromString(bytes);
+  auto type = fixed_size_binary(static_cast<int>(buf->size()));
+  return std::make_shared<FixedSizeBinaryScalar>(std::move(buf), 
std::move(type));
+}
+
+}  // namespace
+
+Result<compute::Expression> FromProto(const st::Expression& expr) {
+  switch (expr.rex_type_case()) {
+    case st::Expression::kLiteral: {
+      ARROW_ASSIGN_OR_RAISE(auto datum, FromProto(expr.literal()));
+      return compute::literal(std::move(datum));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::compute::Expression from ",
+                                expr.DebugString());
+}
+
+Result<Datum> FromProto(const st::Expression::Literal& lit) {
+  switch (lit.literal_type_case()) {
+    case st::Expression::Literal::kBoolean:
+      return Datum(lit.boolean());
+
+    case st::Expression::Literal::kI8:
+      return Datum(static_cast<int8_t>(lit.i8()));
+    case st::Expression::Literal::kI16:
+      return Datum(static_cast<int16_t>(lit.i16()));
+    case st::Expression::Literal::kI32:
+      return Datum(static_cast<int32_t>(lit.i32()));
+    case st::Expression::Literal::kI64:
+      return Datum(static_cast<int64_t>(lit.i64()));
+
+    case st::Expression::Literal::kFp32:
+      return Datum(lit.fp32());
+    case st::Expression::Literal::kFp64:
+      return Datum(lit.fp64());
+
+    case st::Expression::Literal::kString:
+      return Datum(lit.string());
+    case st::Expression::Literal::kBinary:
+      return 
Datum(std::make_shared<BinaryScalar>(Buffer::FromString(lit.binary())));
+
+    case st::Expression::Literal::kTimestamp:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp()), TimeUnit::MICRO));
+
+    case st::Expression::Literal::kTimestampTz:
+      return Datum(std::make_shared<TimestampScalar>(
+          static_cast<int64_t>(lit.timestamp_tz()), TimeUnit::MICRO,
+          TimestampTzTimezoneString()));
+
+    case st::Expression::Literal::kDate:
+      return 
Datum(std::make_shared<Date32Scalar>(static_cast<int32_t>(lit.date())));
+    case st::Expression::Literal::kTime:
+      return 
Datum(std::make_shared<Time64Scalar>(static_cast<int64_t>(lit.time()),
+                                                  TimeUnit::MICRO));
+
+    case st::Expression::Literal::kIntervalYearToMonth:
+    case st::Expression::Literal::kIntervalDayToSecond:
+      break;
+
+    case st::Expression::Literal::kUuid:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.uuid()), uuid()));
+
+    case st::Expression::Literal::kFixedChar:
+      return Datum(std::make_shared<ExtensionScalar>(
+          FixedSizeBinaryScalarFromBytes(lit.fixed_char()),
+          fixed_char(static_cast<int32_t>(lit.fixed_char().size()))));
+
+    case st::Expression::Literal::kVarChar:
+      // FIXME
+      // There's no way to determine VarChar.length from the literal
+      break;
+
+    case st::Expression::Literal::kFixedBinary:
+      return Datum(FixedSizeBinaryScalarFromBytes(lit.fixed_char()));
+
+    case st::Expression::Literal::kDecimal:
+      if (lit.decimal().size() != sizeof(Decimal128)) {
+        return Status::Invalid("Decimal literal had ", lit.decimal().size(),
+                               " bytes (expected ", sizeof(Decimal128), ")");
+      }
+
+      // FIXME
+      // It's not clear how these bytes should be interpreted...
+      // Furthermore, there's no way to determine scale or precision
+      break;
+
+    case st::Expression::Literal::kStruct: {
+      const auto& struct_ = lit.struct_();
+
+      ScalarVector fields(struct_.fields_size());
+      std::vector<std::string> field_names(fields.size(), "");
+      for (size_t i = 0; i < fields.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto field, FromProto(struct_.fields(i)));
+        DCHECK(field.is_scalar());
+        fields.push_back(field.scalar());
+      }
+      ARROW_ASSIGN_OR_RAISE(
+          auto scalar, StructScalar::Make(std::move(fields), 
std::move(field_names)));
+      return Datum(std::move(scalar));
+    }
+
+      // case st::Expression::Literal::kNamedStruct:
+
+    case st::Expression::Literal::kList: {
+      const auto& list = lit.list();
+
+      // FIXME
+      // No way to determine list value type for empty list literals
+      DCHECK_NE(list.values_size(), 0);
+
+      ScalarVector values(list.values_size());
+      for (size_t i = 0; i < values.size(); ++i) {
+        ARROW_ASSIGN_OR_RAISE(auto value, FromProto(list.values(i)));
+        DCHECK(value.is_scalar());
+        values.push_back(value.scalar());
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(values[0]->type));
+      RETURN_NOT_OK(builder->AppendScalars(values));
+      ARROW_ASSIGN_OR_RAISE(auto arr, builder->Finish());
+      return Datum(std::make_shared<ListScalar>(std::move(arr)));
+    }
+
+    case st::Expression::Literal::kMap: {
+      const auto& map = lit.map();
+
+      // FIXME
+      // No way to determine list value type for empty list literals
+      DCHECK_NE(map.key_values_size(), 0);
+
+      ScalarVector keys(map.key_values_size()), values(map.key_values_size());
+      for (size_t i = 0; i < values.size(); ++i) {
+        const auto& kv = map.key_values(i);
+
+        static const std::array<char const*, 4> kMissing = {"key and value", 
"value",
+                                                            "key", nullptr};
+        if (auto missing = kMissing[kv.has_key() + kv.has_value() * 2]) {
+          return Status::Invalid("While converting to MapScalar encountered missing ",
+                                 missing, " in ", map.DebugString());
+        }
+        ARROW_ASSIGN_OR_RAISE(auto key, FromProto(kv.key()));
+        ARROW_ASSIGN_OR_RAISE(auto value, FromProto(kv.value()));
+
+        DCHECK(key.is_scalar());
+        DCHECK(value.is_scalar());
+
+        keys.push_back(key.scalar());
+        values.push_back(value.scalar());
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto key_builder, MakeBuilder(keys[0]->type));
+      ARROW_ASSIGN_OR_RAISE(auto value_builder, MakeBuilder(keys[0]->type));
+      RETURN_NOT_OK(key_builder->AppendScalars(keys));
+      RETURN_NOT_OK(value_builder->AppendScalars(values));
+      ARROW_ASSIGN_OR_RAISE(auto key_arr, key_builder->Finish());
+      ARROW_ASSIGN_OR_RAISE(auto value_arr, value_builder->Finish());
+      ARROW_ASSIGN_OR_RAISE(
+          auto kv_arr,
+          StructArray::Make(ArrayVector{std::move(key_arr), 
std::move(value_arr)},
+                            std::vector<std::string>{"key", "value"}));
+      return Datum(std::make_shared<MapScalar>(std::move(kv_arr)));
+    }
+
+    case st::Expression::Literal::kNull: {
+      ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(lit.null()));
+      if (!type_nullable.second) {
+        return Status::Invalid("Null literal ", lit.DebugString(),
+                               " is of non-nullable type");
+      }
+
+      return Datum(MakeNullScalar(std::move(type_nullable.first)));
+    }
+
+    default:
+      break;
+  }
+
+  return Status::NotImplemented("conversion to arrow::Datum from ", 
lit.DebugString());
+}
+
+namespace {
+struct ToProtoImpl {
+  Status Visit(const NullScalar& s) { return NotImplemented(s); }
+
+  using Lit = st::Expression::Literal;
+
+  template <typename Arg, typename PrimitiveScalar>
+  Status Primitive(void (st::Expression::Literal::*set)(Arg),
+                   const PrimitiveScalar& primitive_scalar) {
+    (type_->*set)(static_cast<Arg>(primitive_scalar.value));
+    return Status::OK();
+  }
+
+  template <typename ScalarWithBufferValue>
+  Status FromBuffer(void (st::Expression::Literal::*set)(std::string&&),
+                    const ScalarWithBufferValue& scalar_with_buffer) {
+    (type_->*set)(util::string_view{*scalar_with_buffer.value}.to_string());
+    return Status::OK();
+  }
+
+  Status Visit(const BooleanScalar& s) { return Primitive(&Lit::set_boolean, 
s); }
+
+  Status Visit(const Int8Scalar& s) { return Primitive(&Lit::set_i8, s); }
+  Status Visit(const Int16Scalar& s) { return Primitive(&Lit::set_i16, s); }
+  Status Visit(const Int32Scalar& s) { return Primitive(&Lit::set_i32, s); }
+  Status Visit(const Int64Scalar& s) { return Primitive(&Lit::set_i64, s); }
+
+  Status Visit(const UInt8Scalar& s) { return NotImplemented(s); }
+  Status Visit(const UInt16Scalar& s) { return NotImplemented(s); }
+  Status Visit(const UInt32Scalar& s) { return NotImplemented(s); }
+  Status Visit(const UInt64Scalar& s) { return NotImplemented(s); }
+
+  Status Visit(const HalfFloatScalar& s) { return NotImplemented(s); }
+  Status Visit(const FloatScalar& s) { return Primitive(&Lit::set_fp32, s); }
+  Status Visit(const DoubleScalar& s) { return Primitive(&Lit::set_fp64, s); }
+
+  Status Visit(const StringScalar& s) { return FromBuffer(&Lit::set_string, 
s); }
+  Status Visit(const BinaryScalar& s) { return FromBuffer(&Lit::set_binary, 
s); }
+
+  Status Visit(const FixedSizeBinaryScalar& s) {
+    return FromBuffer(&Lit::set_fixed_binary, s);
+  }
+
+  Status Visit(const Date32Scalar& s) { return Primitive(&Lit::set_date, s); }
+  Status Visit(const Date64Scalar& s) { return NotImplemented(s); }
+
+  Status Visit(const TimestampScalar& s) {
+    const auto& t = internal::checked_cast<const TimestampType&>(*s.type);
+
+    if (t.unit() != TimeUnit::MICRO) return NotImplemented(s);

Review comment:
       Could this be a `TODO`?

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";
+    fields[i] =
+        field(std::move(name), std::move(type_nullable.first), 
type_nullable.second);
+  }
+  return fields;
+}
+
+}  // namespace
+
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProto(const st::Type& 
type) {
+  switch (type.kind_case()) {
+    case st::Type::kBool:
+      return FromProtoImpl<BooleanType>(type.bool_());
+
+    case st::Type::kI8:
+      return FromProtoImpl<Int8Type>(type.i8());
+    case st::Type::kI16:
+      return FromProtoImpl<Int16Type>(type.i16());
+    case st::Type::kI32:
+      return FromProtoImpl<Int32Type>(type.i32());
+    case st::Type::kI64:
+      return FromProtoImpl<Int64Type>(type.i64());
+
+    case st::Type::kFp32:
+      return FromProtoImpl<FloatType>(type.fp32());
+    case st::Type::kFp64:
+      return FromProtoImpl<DoubleType>(type.fp64());
+
+    case st::Type::kString:
+      return FromProtoImpl<StringType>(type.string());
+    case st::Type::kBinary:
+      return FromProtoImpl<BinaryType>(type.binary());
+
+    case st::Type::kTimestamp:
+      return FromProtoImpl<TimestampType>(type.timestamp(), TimeUnit::MICRO);
+    case st::Type::kTimestampTz:
+      return FromProtoImpl<TimestampType>(type.timestamp_tz(), TimeUnit::MICRO,
+                                          TimestampTzTimezoneString());
+    case st::Type::kDate:
+      // FIXME
+      // Substrait uses uint32_t to store dates, and further restricts the 
allowed
+      // range of dates to [1000-01-01..9999-12-31]. Does this mean the value 
should
+      // be interpreted as an offset from 1000-01-01 instead of the epoch? Or 
should
+      // the value be signed instead?
+      // Furthermore, simple_logical_types.md states that the equivalent arrow 
type
+      // is Date64 (which measures milliseconds rather than days). Is that 
incorrect?

Review comment:
       The current Substrait docs on dates are not fully specified and, yes, 
are likely incorrect about this type.

##########
File path: cpp/src/arrow/engine/substrait/type_internal.cc
##########
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/engine/substrait/type_internal.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+namespace engine {
+namespace {
+
+template <typename TypeMessage>
+Status CheckVariation(const TypeMessage& type) {
+  if (!type.has_variation()) return Status::OK();
+  return Status::NotImplemented("Type.Variation for ", type.DebugString());
+}
+
+template <typename TypeMessage>
+bool IsNullable(const TypeMessage& type) {
+  return type.nullability() == st::Type_Nullability_NULLABLE;
+}
+
+template <typename ArrowType, typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(const 
TypeMessage& type,
+                                                                 A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(std::static_pointer_cast<DataType>(
+                            
std::make_shared<ArrowType>(std::forward<A>(args)...)),
+                        IsNullable(type));
+}
+
+template <typename TypeMessage, typename... A>
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProtoImpl(
+    const TypeMessage& type, std::shared_ptr<DataType> type_factory(A...), 
A&&... args) {
+  RETURN_NOT_OK(CheckVariation(type));
+
+  return std::make_pair(
+      
std::static_pointer_cast<DataType>(type_factory(std::forward<A>(args)...)),
+      IsNullable(type));
+}
+
+template <typename Types, typename Names = const std::string*>
+Result<FieldVector> FieldsFromProto(
+    int size, const Types& types,
+    const Names* names = static_cast<const Names*>(nullptr)) {
+  FieldVector fields(size);
+  for (int i = 0; i < size; ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto type_nullable, FromProto(types[i]));
+
+    std::string name = names ? std::move((*names)[i]) : "";
+    fields[i] =
+        field(std::move(name), std::move(type_nullable.first), 
type_nullable.second);
+  }
+  return fields;
+}
+
+}  // namespace
+
+Result<std::pair<std::shared_ptr<DataType>, bool>> FromProto(const st::Type& 
type) {
+  switch (type.kind_case()) {
+    case st::Type::kBool:
+      return FromProtoImpl<BooleanType>(type.bool_());
+
+    case st::Type::kI8:
+      return FromProtoImpl<Int8Type>(type.i8());
+    case st::Type::kI16:
+      return FromProtoImpl<Int16Type>(type.i16());
+    case st::Type::kI32:
+      return FromProtoImpl<Int32Type>(type.i32());
+    case st::Type::kI64:
+      return FromProtoImpl<Int64Type>(type.i64());
+
+    case st::Type::kFp32:
+      return FromProtoImpl<FloatType>(type.fp32());
+    case st::Type::kFp64:
+      return FromProtoImpl<DoubleType>(type.fp64());
+
+    case st::Type::kString:
+      return FromProtoImpl<StringType>(type.string());
+    case st::Type::kBinary:
+      return FromProtoImpl<BinaryType>(type.binary());
+
+    case st::Type::kTimestamp:
+      return FromProtoImpl<TimestampType>(type.timestamp(), TimeUnit::MICRO);
+    case st::Type::kTimestampTz:
+      return FromProtoImpl<TimestampType>(type.timestamp_tz(), TimeUnit::MICRO,
+                                          TimestampTzTimezoneString());
+    case st::Type::kDate:
+      // FIXME
+      // Substrait uses uint32_t to store dates, and further restricts the 
allowed
+      // range of dates to [1000-01-01..9999-12-31]. Does this mean the value 
should
+      // be interpreted as an offset from 1000-01-01 instead of the epoch? Or 
should
+      // the value be signed instead?
+      // Furthermore, simple_logical_types.md states that the equivalent arrow 
type
+      // is Date64 (which measures milliseconds rather than days). Is that 
incorrect?
+      return FromProtoImpl<Date32Type>(type.date());
+
+    case st::Type::kTime:
+      return FromProtoImpl<Time64Type>(type.time(), TimeUnit::MICRO);
+
+    case st::Type::kIntervalYear:
+      // FIXME
+      // None of MonthIntervalType, DayTimeIntervalType, 
MonthDayNanoIntervalType
+      // corresponds; none has a year field. Lossy conversion to 
MonthIntervalType
+      // would be possible...

Review comment:
       This Substrait type may require an extension type.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to