This is an automated email from the ASF dual-hosted git repository.

westonpace pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ad78a31f55 ARROW-16816: [C++] Upgrade Substrait to v0.6.0 (#13468)
ad78a31f55 is described below

commit ad78a31f55a051af8d85ae761dff20055789fe83
Author: Jeroen van Straten <[email protected]>
AuthorDate: Tue Jul 5 18:38:13 2022 +0200

    ARROW-16816: [C++] Upgrade Substrait to v0.6.0 (#13468)
    
    Note: I actually upgraded to v0.6.0; it didn't make much sense to me to not 
just go to the latest release. I guess I'll downgrade if there was a specific 
reason for going to exactly v0.3.0 that I'm not aware of.
    
    Stuff that broke:
     - `relations.proto` and `expressions.proto` were merged into 
`algebra.proto` in https://github.com/substrait-io/substrait/pull/136
     - Breaking change in how file formats are specified in read relations: 
https://github.com/substrait-io/substrait/pull/169
     - Deprecation in specification of function arguments, switched to the new 
format (supporting the old one as well would be a bit more work, which I'm not 
sure is worthwhile at this stage): 
https://github.com/substrait-io/substrait/pull/161
     - Deprecation of `user_defined_type_reference` in `Type`, replacing it 
with a message that also supports nullability: 
https://github.com/substrait-io/substrait/pull/217
    
    Authored-by: Jeroen van Straten <[email protected]>
    Signed-off-by: Weston Pace <[email protected]>
---
 cpp/cmake_modules/ThirdpartyToolchain.cmake        |  13 +--
 .../arrow/engine/substrait/expression_internal.cc  |  21 +++-
 .../arrow/engine/substrait/expression_internal.h   |   2 +-
 .../arrow/engine/substrait/relation_internal.cc    |  21 ++--
 cpp/src/arrow/engine/substrait/relation_internal.h |   2 +-
 cpp/src/arrow/engine/substrait/serde_test.cc       | 118 ++++++++++++---------
 cpp/src/arrow/engine/substrait/type_internal.cc    |  13 ++-
 cpp/thirdparty/versions.txt                        |   4 +-
 python/pyarrow/tests/test_substrait.py             |   3 +-
 r/tests/testthat/test-query-engine.R               |   2 +-
 10 files changed, 111 insertions(+), 88 deletions(-)

diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 8dea78eec9..10654d02a9 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -1685,16 +1685,9 @@ endif()
 macro(build_substrait)
   message(STATUS "Building Substrait from source")
 
-  set(SUBSTRAIT_PROTOS
-      capabilities
-      expression
-      extensions/extensions
-      function
-      parameterized_types
-      plan
-      relations
-      type
-      type_expressions)
+  # Note: not all protos in Substrait actually matter to plan
+  # consumption. No need to build the ones we don't need.
+  set(SUBSTRAIT_PROTOS algebra extensions/extensions plan type)
 
   externalproject_add(substrait_ep
                       CONFIGURE_COMMAND ""
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc 
b/cpp/src/arrow/engine/substrait/expression_internal.cc
index 5d7d66225e..694edd614b 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.cc
+++ b/cpp/src/arrow/engine/substrait/expression_internal.cc
@@ -160,9 +160,18 @@ Result<compute::Expression> FromProto(const 
substrait::Expression& expr,
       ARROW_ASSIGN_OR_RAISE(auto decoded_function,
                             
ext_set.DecodeFunction(scalar_fn.function_reference()));
 
-      std::vector<compute::Expression> arguments(scalar_fn.args_size());
-      for (int i = 0; i < scalar_fn.args_size(); ++i) {
-        ARROW_ASSIGN_OR_RAISE(arguments[i], FromProto(scalar_fn.args(i), 
ext_set));
+      std::vector<compute::Expression> arguments(scalar_fn.arguments_size());
+      for (int i = 0; i < scalar_fn.arguments_size(); ++i) {
+        const auto& argument = scalar_fn.arguments(i);
+        switch (argument.arg_type_case()) {
+          case substrait::FunctionArgument::kValue: {
+            ARROW_ASSIGN_OR_RAISE(arguments[i], FromProto(argument.value(), 
ext_set));
+            break;
+          }
+          default:
+            return Status::NotImplemented(
+                "only value arguments are currently supported for functions");
+        }
       }
 
       auto func_name = decoded_function.name.to_string();
@@ -900,9 +909,11 @@ Result<std::unique_ptr<substrait::Expression>> 
ToProto(const compute::Expression
 
   auto scalar_fn = 
internal::make_unique<substrait::Expression::ScalarFunction>();
   scalar_fn->set_function_reference(anchor);
-  scalar_fn->mutable_args()->Reserve(static_cast<int>(arguments.size()));
+  scalar_fn->mutable_arguments()->Reserve(static_cast<int>(arguments.size()));
   for (auto& arg : arguments) {
-    scalar_fn->mutable_args()->AddAllocated(arg.release());
+    auto argument = internal::make_unique<substrait::FunctionArgument>();
+    argument->set_allocated_value(arg.release());
+    scalar_fn->mutable_arguments()->AddAllocated(argument.release());
   }
 
   out->set_allocated_scalar_function(scalar_fn.release());
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.h 
b/cpp/src/arrow/engine/substrait/expression_internal.h
index 6bbc2d8c76..4e23dc8f70 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.h
+++ b/cpp/src/arrow/engine/substrait/expression_internal.h
@@ -26,7 +26,7 @@
 #include "arrow/engine/substrait/visibility.h"
 #include "arrow/type_fwd.h"
 
-#include "substrait/expression.pb.h"  // IWYU pragma: export
+#include "substrait/algebra.pb.h"  // IWYU pragma: export
 
 namespace arrow {
 namespace engine {
diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc 
b/cpp/src/arrow/engine/substrait/relation_internal.cc
index 61e2986551..dce66eccf8 100644
--- a/cpp/src/arrow/engine/substrait/relation_internal.cc
+++ b/cpp/src/arrow/engine/substrait/relation_internal.cc
@@ -109,17 +109,16 @@ Result<compute::Declaration> FromProto(const 
substrait::Rel& rel,
           path = item.uri_path_glob();
         }
 
-        if (item.format() ==
-            substrait::ReadRel::LocalFiles::FileOrFiles::FILE_FORMAT_PARQUET) {
-          format = std::make_shared<dataset::ParquetFileFormat>();
-        } else if (util::string_view{path}.ends_with(".arrow")) {
-          format = std::make_shared<dataset::IpcFileFormat>();
-        } else if (util::string_view{path}.ends_with(".feather")) {
-          format = std::make_shared<dataset::IpcFileFormat>();
-        } else {
-          return Status::NotImplemented(
-              "substrait::ReadRel::LocalFiles::FileOrFiles::format "
-              "other than FILE_FORMAT_PARQUET");
+        switch (item.file_format_case()) {
+          case substrait::ReadRel_LocalFiles_FileOrFiles::kParquet:
+            format = std::make_shared<dataset::ParquetFileFormat>();
+            break;
+          case substrait::ReadRel_LocalFiles_FileOrFiles::kArrow:
+            format = std::make_shared<dataset::IpcFileFormat>();
+            break;
+          default:
+            return Status::NotImplemented(
+                "unknown 
substrait::ReadRel::LocalFiles::FileOrFiles::file_format");
         }
 
         if (!util::string_view{path}.starts_with("file:///")) {
diff --git a/cpp/src/arrow/engine/substrait/relation_internal.h 
b/cpp/src/arrow/engine/substrait/relation_internal.h
index 77d47c586b..ec56a2d359 100644
--- a/cpp/src/arrow/engine/substrait/relation_internal.h
+++ b/cpp/src/arrow/engine/substrait/relation_internal.h
@@ -25,7 +25,7 @@
 #include "arrow/engine/substrait/visibility.h"
 #include "arrow/type_fwd.h"
 
-#include "substrait/relations.pb.h"  // IWYU pragma: export
+#include "substrait/algebra.pb.h"  // IWYU pragma: export
 
 namespace arrow {
 namespace engine {
diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc 
b/cpp/src/arrow/engine/substrait/serde_test.cc
index e200c7d992..e10082392d 100644
--- a/cpp/src/arrow/engine/substrait/serde_test.cc
+++ b/cpp/src/arrow/engine/substrait/serde_test.cc
@@ -204,7 +204,8 @@ TEST(Substrait, SupportedExtensionTypes) {
     ASSERT_OK_AND_ASSIGN(
         auto buf,
         internal::SubstraitFromJSON(
-            "Type", "{\"user_defined_type_reference\": " + 
std::to_string(anchor) + "}"));
+            "Type", "{\"user_defined\": { \"type_reference\": " + 
std::to_string(anchor) +
+                        ", \"nullability\": \"NULLABILITY_NULLABLE\" } }"));
 
     ASSERT_OK_AND_ASSIGN(auto type, DeserializeType(*buf, ext_set));
     EXPECT_EQ(*type, *expected_type);
@@ -279,8 +280,9 @@ TEST(Substrait, NamedStruct) {
 }
 
 TEST(Substrait, NoEquivalentArrowType) {
-  ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON(
-                                     "Type", 
R"({"user_defined_type_reference": 99})"));
+  ASSERT_OK_AND_ASSIGN(
+      auto buf,
+      internal::SubstraitFromJSON("Type", R"({"user_defined": 
{"type_reference": 99}})"));
   ExtensionSet empty;
   ASSERT_THAT(
       DeserializeType(*buf, empty),
@@ -650,11 +652,11 @@ TEST(Substrait, ReadRel) {
         "items": [
           {
             "uri_file": "file:///tmp/dat1.parquet",
-            "format": "FILE_FORMAT_PARQUET"
+            "parquet": {}
           },
           {
             "uri_file": "file:///tmp/dat2.parquet",
-            "format": "FILE_FORMAT_PARQUET"
+            "parquet": {}
           }
         ]
       }
@@ -879,7 +881,7 @@ Result<std::string> GetSubstraitJSON() {
             "items": [
               {
                 "uri_file": "file://FILENAME_PLACEHOLDER",
-                "format": "FILE_FORMAT_PARQUET"
+                "parquet": {}
               }
             ]
           }
@@ -1011,7 +1013,7 @@ TEST(Substrait, JoinPlanBasic) {
               "items": [
                 {
                   "uri_file": "file:///tmp/dat1.parquet",
-                  "format": "FILE_FORMAT_PARQUET"
+                  "parquet": {}
                 }
               ]
             }
@@ -1035,7 +1037,7 @@ TEST(Substrait, JoinPlanBasic) {
               "items": [
                 {
                   "uri_file": "file:///tmp/dat2.parquet",
-                  "format": "FILE_FORMAT_PARQUET"
+                  "parquet": {}
                 }
               ]
             }
@@ -1044,24 +1046,28 @@ TEST(Substrait, JoinPlanBasic) {
         "expression": {
           "scalarFunction": {
             "functionReference": 0,
-            "args": [{
-              "selection": {
-                "directReference": {
-                  "structField": {
-                    "field": 0
+            "arguments": [{
+              "value": {
+                "selection": {
+                  "directReference": {
+                    "structField": {
+                      "field": 0
+                    }
+                  },
+                  "rootReference": {
                   }
-                },
-                "rootReference": {
                 }
               }
             }, {
-              "selection": {
-                "directReference": {
-                  "structField": {
-                    "field": 5
+              "value": {
+                "selection": {
+                  "directReference": {
+                    "structField": {
+                      "field": 5
+                    }
+                  },
+                  "rootReference": {
                   }
-                },
-                "rootReference": {
                 }
               }
             }]
@@ -1147,7 +1153,7 @@ TEST(Substrait, JoinPlanInvalidKeyCmp) {
               "items": [
                 {
                   "uri_file": "file:///tmp/dat1.parquet",
-                  "format": "FILE_FORMAT_PARQUET"
+                  "parquet": {}
                 }
               ]
             }
@@ -1171,7 +1177,7 @@ TEST(Substrait, JoinPlanInvalidKeyCmp) {
               "items": [
                 {
                   "uri_file": "file:///tmp/dat2.parquet",
-                  "format": "FILE_FORMAT_PARQUET"
+                  "parquet": {}
                 }
               ]
             }
@@ -1180,24 +1186,28 @@ TEST(Substrait, JoinPlanInvalidKeyCmp) {
         "expression": {
           "scalarFunction": {
             "functionReference": 0,
-            "args": [{
-              "selection": {
-                "directReference": {
-                  "structField": {
-                    "field": 0
+            "arguments": [{
+              "value": {
+                "selection": {
+                  "directReference": {
+                    "structField": {
+                      "field": 0
+                    }
+                  },
+                  "rootReference": {
                   }
-                },
-                "rootReference": {
                 }
               }
             }, {
-              "selection": {
-                "directReference": {
-                  "structField": {
-                    "field": 5
+              "value": {
+                "selection": {
+                  "directReference": {
+                    "structField": {
+                      "field": 5
+                    }
+                  },
+                  "rootReference": {
                   }
-                },
-                "rootReference": {
                 }
               }
             }]
@@ -1255,7 +1265,7 @@ TEST(Substrait, JoinPlanInvalidExpression) {
               "items": [
                 {
                   "uri_file": "file:///tmp/dat1.parquet",
-                  "format": "FILE_FORMAT_PARQUET"
+                  "parquet": {}
                 }
               ]
             }
@@ -1279,7 +1289,7 @@ TEST(Substrait, JoinPlanInvalidExpression) {
               "items": [
                 {
                   "uri_file": "file:///tmp/dat2.parquet",
-                  "format": "FILE_FORMAT_PARQUET"
+                  "parquet": {}
                 }
               ]
             }
@@ -1323,7 +1333,7 @@ TEST(Substrait, JoinPlanInvalidKeys) {
               "items": [
                 {
                   "uri_file": "file:///tmp/dat1.parquet",
-                  "format": "FILE_FORMAT_PARQUET"
+                  "parquet": {}
                 }
               ]
             }
@@ -1332,24 +1342,28 @@ TEST(Substrait, JoinPlanInvalidKeys) {
         "expression": {
           "scalarFunction": {
             "functionReference": 0,
-            "args": [{
-              "selection": {
-                "directReference": {
-                  "structField": {
-                    "field": 0
+            "arguments": [{
+              "value": {
+                "selection": {
+                  "directReference": {
+                    "structField": {
+                      "field": 0
+                    }
+                  },
+                  "rootReference": {
                   }
-                },
-                "rootReference": {
                 }
               }
             }, {
-              "selection": {
-                "directReference": {
-                  "structField": {
-                    "field": 5
+              "value": {
+                "selection": {
+                  "directReference": {
+                    "structField": {
+                      "field": 5
+                    }
+                  },
+                  "rootReference": {
                   }
-                },
-                "rootReference": {
                 }
               }
             }]
diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc 
b/cpp/src/arrow/engine/substrait/type_internal.cc
index 178e6d25a5..6c65b32e2a 100644
--- a/cpp/src/arrow/engine/substrait/type_internal.cc
+++ b/cpp/src/arrow/engine/substrait/type_internal.cc
@@ -196,10 +196,11 @@ Result<std::pair<std::shared_ptr<DataType>, bool>> 
FromProto(
           field("value", std::move(value_nullable.first), 
value_nullable.second));
     }
 
-    case ::substrait::Type::kUserDefinedTypeReference: {
-      uint32_t anchor = type.user_defined_type_reference();
+    case ::substrait::Type::kUserDefined: {
+      const auto& user_defined = type.user_defined();
+      uint32_t anchor = user_defined.type_reference();
       ARROW_ASSIGN_OR_RAISE(auto type_record, ext_set.DecodeType(anchor));
-      return std::make_pair(std::move(type_record.type), true);
+      return std::make_pair(std::move(type_record.type), 
IsNullable(user_defined));
     }
 
     default:
@@ -389,7 +390,11 @@ struct DataTypeToProtoImpl {
   template <typename T>
   Status EncodeUserDefined(const T& t) {
     ARROW_ASSIGN_OR_RAISE(auto anchor, ext_set_->EncodeType(t));
-    type_->set_user_defined_type_reference(anchor);
+    auto user_defined = internal::make_unique<::substrait::Type_UserDefined>();
+    user_defined->set_type_reference(anchor);
+    user_defined->set_nullability(nullable_ ? 
::substrait::Type::NULLABILITY_NULLABLE
+                                            : 
::substrait::Type::NULLABILITY_REQUIRED);
+    type_->set_allocated_user_defined(user_defined.release());
     return Status::OK();
   }
 
diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt
index 7dc95cd7e0..5adcb1a2f5 100644
--- a/cpp/thirdparty/versions.txt
+++ b/cpp/thirdparty/versions.txt
@@ -85,8 +85,8 @@ 
ARROW_SNAPPY_BUILD_SHA256_CHECKSUM=75c1fbb3d618dd3a0483bff0e26d0a92b495bbe5059c8
 # There is a bug in GCC < 4.9 with Snappy 1.1.9, so revert to 1.1.8 for those 
(ARROW-14661)
 ARROW_SNAPPY_OLD_BUILD_VERSION=1.1.8
 
ARROW_SNAPPY_OLD_BUILD_SHA256_CHECKSUM=16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f
-ARROW_SUBSTRAIT_BUILD_VERSION=e1b4c04a
-ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=65f83e5f5d979ede5fc8ac9f8bbaf793e0c72d9c415f1a162ba522f6d0bb5bbe
+ARROW_SUBSTRAIT_BUILD_VERSION=v0.6.0
+ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=7b8583b9684477e9027f417bbfb4febb8acfeb01923dcaa7cf0fd3f921d69c88
 ARROW_THRIFT_BUILD_VERSION=0.16.0
 
ARROW_THRIFT_BUILD_SHA256_CHECKSUM=f460b5c1ca30d8918ff95ea3eb6291b3951cf518553566088f3f2be8981f6209
 ARROW_UCX_BUILD_VERSION=1.12.1
diff --git a/python/pyarrow/tests/test_substrait.py 
b/python/pyarrow/tests/test_substrait.py
index 8df35bbba4..98c206fd7e 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -55,7 +55,8 @@ def test_run_serialized_query(tmpdir):
             "local_files": {
                 "items": [
                 {
-                    "uri_file": "file://FILENAME_PLACEHOLDER"
+                    "uri_file": "file://FILENAME_PLACEHOLDER",
+                    "arrow": {}
                 }
                 ]
             }
diff --git a/r/tests/testthat/test-query-engine.R 
b/r/tests/testthat/test-query-engine.R
index 8a67c1e764..dd87335f87 100644
--- a/r/tests/testthat/test-query-engine.R
+++ b/r/tests/testthat/test-query-engine.R
@@ -41,7 +41,7 @@ test_that("do_exec_plan_substrait can evaluate a simple 
plan", {
             "items": [
               {
                 "uri_file": "file://%s",
-                "format": "FILE_FORMAT_PARQUET"
+                "parquet": {}
               }
             ]
           }

Reply via email to