This is an automated email from the ASF dual-hosted git repository.
westonpace pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ad78a31f55 ARROW-16816: [C++] Upgrade Substrait to v0.6.0 (#13468)
ad78a31f55 is described below
commit ad78a31f55a051af8d85ae761dff20055789fe83
Author: Jeroen van Straten <[email protected]>
AuthorDate: Tue Jul 5 18:38:13 2022 +0200
ARROW-16816: [C++] Upgrade Substrait to v0.6.0 (#13468)
Note: I actually upgraded to v0.6.0; it didn't make much sense to me to not
just go to the latest release. I guess I'll downgrade if there was a specific
reason for going to exactly v0.3.0 that I'm not aware of.
Stuff that broke:
- `relations.proto` and `expressions.proto` were merged into
`algebra.proto` in https://github.com/substrait-io/substrait/pull/136
- Breaking change in how file formats are specified in read relations:
https://github.com/substrait-io/substrait/pull/169
- Deprecation in specification of function arguments, switched to the new
format (supporting the old one as well would be a bit more work, which I'm not
sure is worthwhile at this stage):
https://github.com/substrait-io/substrait/pull/161
- Deprecation of `user_defined_type_reference` in `Type`, replacing it
with a message that also supports nullability:
https://github.com/substrait-io/substrait/pull/217
Authored-by: Jeroen van Straten <[email protected]>
Signed-off-by: Weston Pace <[email protected]>
---
cpp/cmake_modules/ThirdpartyToolchain.cmake | 13 +--
.../arrow/engine/substrait/expression_internal.cc | 21 +++-
.../arrow/engine/substrait/expression_internal.h | 2 +-
.../arrow/engine/substrait/relation_internal.cc | 21 ++--
cpp/src/arrow/engine/substrait/relation_internal.h | 2 +-
cpp/src/arrow/engine/substrait/serde_test.cc | 118 ++++++++++++---------
cpp/src/arrow/engine/substrait/type_internal.cc | 13 ++-
cpp/thirdparty/versions.txt | 4 +-
python/pyarrow/tests/test_substrait.py | 3 +-
r/tests/testthat/test-query-engine.R | 2 +-
10 files changed, 111 insertions(+), 88 deletions(-)
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 8dea78eec9..10654d02a9 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -1685,16 +1685,9 @@ endif()
macro(build_substrait)
message(STATUS "Building Substrait from source")
- set(SUBSTRAIT_PROTOS
- capabilities
- expression
- extensions/extensions
- function
- parameterized_types
- plan
- relations
- type
- type_expressions)
+ # Note: not all protos in Substrait actually matter to plan
+ # consumption. No need to build the ones we don't need.
+ set(SUBSTRAIT_PROTOS algebra extensions/extensions plan type)
externalproject_add(substrait_ep
CONFIGURE_COMMAND ""
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc
b/cpp/src/arrow/engine/substrait/expression_internal.cc
index 5d7d66225e..694edd614b 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.cc
+++ b/cpp/src/arrow/engine/substrait/expression_internal.cc
@@ -160,9 +160,18 @@ Result<compute::Expression> FromProto(const
substrait::Expression& expr,
ARROW_ASSIGN_OR_RAISE(auto decoded_function,
ext_set.DecodeFunction(scalar_fn.function_reference()));
- std::vector<compute::Expression> arguments(scalar_fn.args_size());
- for (int i = 0; i < scalar_fn.args_size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(arguments[i], FromProto(scalar_fn.args(i),
ext_set));
+ std::vector<compute::Expression> arguments(scalar_fn.arguments_size());
+ for (int i = 0; i < scalar_fn.arguments_size(); ++i) {
+ const auto& argument = scalar_fn.arguments(i);
+ switch (argument.arg_type_case()) {
+ case substrait::FunctionArgument::kValue: {
+ ARROW_ASSIGN_OR_RAISE(arguments[i], FromProto(argument.value(),
ext_set));
+ break;
+ }
+ default:
+ return Status::NotImplemented(
+ "only value arguments are currently supported for functions");
+ }
}
auto func_name = decoded_function.name.to_string();
@@ -900,9 +909,11 @@ Result<std::unique_ptr<substrait::Expression>>
ToProto(const compute::Expression
auto scalar_fn =
internal::make_unique<substrait::Expression::ScalarFunction>();
scalar_fn->set_function_reference(anchor);
- scalar_fn->mutable_args()->Reserve(static_cast<int>(arguments.size()));
+ scalar_fn->mutable_arguments()->Reserve(static_cast<int>(arguments.size()));
for (auto& arg : arguments) {
- scalar_fn->mutable_args()->AddAllocated(arg.release());
+ auto argument = internal::make_unique<substrait::FunctionArgument>();
+ argument->set_allocated_value(arg.release());
+ scalar_fn->mutable_arguments()->AddAllocated(argument.release());
}
out->set_allocated_scalar_function(scalar_fn.release());
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.h
b/cpp/src/arrow/engine/substrait/expression_internal.h
index 6bbc2d8c76..4e23dc8f70 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.h
+++ b/cpp/src/arrow/engine/substrait/expression_internal.h
@@ -26,7 +26,7 @@
#include "arrow/engine/substrait/visibility.h"
#include "arrow/type_fwd.h"
-#include "substrait/expression.pb.h" // IWYU pragma: export
+#include "substrait/algebra.pb.h" // IWYU pragma: export
namespace arrow {
namespace engine {
diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc
b/cpp/src/arrow/engine/substrait/relation_internal.cc
index 61e2986551..dce66eccf8 100644
--- a/cpp/src/arrow/engine/substrait/relation_internal.cc
+++ b/cpp/src/arrow/engine/substrait/relation_internal.cc
@@ -109,17 +109,16 @@ Result<compute::Declaration> FromProto(const
substrait::Rel& rel,
path = item.uri_path_glob();
}
- if (item.format() ==
- substrait::ReadRel::LocalFiles::FileOrFiles::FILE_FORMAT_PARQUET) {
- format = std::make_shared<dataset::ParquetFileFormat>();
- } else if (util::string_view{path}.ends_with(".arrow")) {
- format = std::make_shared<dataset::IpcFileFormat>();
- } else if (util::string_view{path}.ends_with(".feather")) {
- format = std::make_shared<dataset::IpcFileFormat>();
- } else {
- return Status::NotImplemented(
- "substrait::ReadRel::LocalFiles::FileOrFiles::format "
- "other than FILE_FORMAT_PARQUET");
+ switch (item.file_format_case()) {
+ case substrait::ReadRel_LocalFiles_FileOrFiles::kParquet:
+ format = std::make_shared<dataset::ParquetFileFormat>();
+ break;
+ case substrait::ReadRel_LocalFiles_FileOrFiles::kArrow:
+ format = std::make_shared<dataset::IpcFileFormat>();
+ break;
+ default:
+ return Status::NotImplemented(
+ "unknown
substrait::ReadRel::LocalFiles::FileOrFiles::file_format");
}
if (!util::string_view{path}.starts_with("file:///")) {
diff --git a/cpp/src/arrow/engine/substrait/relation_internal.h
b/cpp/src/arrow/engine/substrait/relation_internal.h
index 77d47c586b..ec56a2d359 100644
--- a/cpp/src/arrow/engine/substrait/relation_internal.h
+++ b/cpp/src/arrow/engine/substrait/relation_internal.h
@@ -25,7 +25,7 @@
#include "arrow/engine/substrait/visibility.h"
#include "arrow/type_fwd.h"
-#include "substrait/relations.pb.h" // IWYU pragma: export
+#include "substrait/algebra.pb.h" // IWYU pragma: export
namespace arrow {
namespace engine {
diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc
b/cpp/src/arrow/engine/substrait/serde_test.cc
index e200c7d992..e10082392d 100644
--- a/cpp/src/arrow/engine/substrait/serde_test.cc
+++ b/cpp/src/arrow/engine/substrait/serde_test.cc
@@ -204,7 +204,8 @@ TEST(Substrait, SupportedExtensionTypes) {
ASSERT_OK_AND_ASSIGN(
auto buf,
internal::SubstraitFromJSON(
- "Type", "{\"user_defined_type_reference\": " +
std::to_string(anchor) + "}"));
+ "Type", "{\"user_defined\": { \"type_reference\": " +
std::to_string(anchor) +
+ ", \"nullability\": \"NULLABILITY_NULLABLE\" } }"));
ASSERT_OK_AND_ASSIGN(auto type, DeserializeType(*buf, ext_set));
EXPECT_EQ(*type, *expected_type);
@@ -279,8 +280,9 @@ TEST(Substrait, NamedStruct) {
}
TEST(Substrait, NoEquivalentArrowType) {
- ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON(
- "Type",
R"({"user_defined_type_reference": 99})"));
+ ASSERT_OK_AND_ASSIGN(
+ auto buf,
+ internal::SubstraitFromJSON("Type", R"({"user_defined":
{"type_reference": 99}})"));
ExtensionSet empty;
ASSERT_THAT(
DeserializeType(*buf, empty),
@@ -650,11 +652,11 @@ TEST(Substrait, ReadRel) {
"items": [
{
"uri_file": "file:///tmp/dat1.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
},
{
"uri_file": "file:///tmp/dat2.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -879,7 +881,7 @@ Result<std::string> GetSubstraitJSON() {
"items": [
{
"uri_file": "file://FILENAME_PLACEHOLDER",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1011,7 +1013,7 @@ TEST(Substrait, JoinPlanBasic) {
"items": [
{
"uri_file": "file:///tmp/dat1.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1035,7 +1037,7 @@ TEST(Substrait, JoinPlanBasic) {
"items": [
{
"uri_file": "file:///tmp/dat2.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1044,24 +1046,28 @@ TEST(Substrait, JoinPlanBasic) {
"expression": {
"scalarFunction": {
"functionReference": 0,
- "args": [{
- "selection": {
- "directReference": {
- "structField": {
- "field": 0
+ "arguments": [{
+ "value": {
+ "selection": {
+ "directReference": {
+ "structField": {
+ "field": 0
+ }
+ },
+ "rootReference": {
}
- },
- "rootReference": {
}
}
}, {
- "selection": {
- "directReference": {
- "structField": {
- "field": 5
+ "value": {
+ "selection": {
+ "directReference": {
+ "structField": {
+ "field": 5
+ }
+ },
+ "rootReference": {
}
- },
- "rootReference": {
}
}
}]
@@ -1147,7 +1153,7 @@ TEST(Substrait, JoinPlanInvalidKeyCmp) {
"items": [
{
"uri_file": "file:///tmp/dat1.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1171,7 +1177,7 @@ TEST(Substrait, JoinPlanInvalidKeyCmp) {
"items": [
{
"uri_file": "file:///tmp/dat2.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1180,24 +1186,28 @@ TEST(Substrait, JoinPlanInvalidKeyCmp) {
"expression": {
"scalarFunction": {
"functionReference": 0,
- "args": [{
- "selection": {
- "directReference": {
- "structField": {
- "field": 0
+ "arguments": [{
+ "value": {
+ "selection": {
+ "directReference": {
+ "structField": {
+ "field": 0
+ }
+ },
+ "rootReference": {
}
- },
- "rootReference": {
}
}
}, {
- "selection": {
- "directReference": {
- "structField": {
- "field": 5
+ "value": {
+ "selection": {
+ "directReference": {
+ "structField": {
+ "field": 5
+ }
+ },
+ "rootReference": {
}
- },
- "rootReference": {
}
}
}]
@@ -1255,7 +1265,7 @@ TEST(Substrait, JoinPlanInvalidExpression) {
"items": [
{
"uri_file": "file:///tmp/dat1.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1279,7 +1289,7 @@ TEST(Substrait, JoinPlanInvalidExpression) {
"items": [
{
"uri_file": "file:///tmp/dat2.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1323,7 +1333,7 @@ TEST(Substrait, JoinPlanInvalidKeys) {
"items": [
{
"uri_file": "file:///tmp/dat1.parquet",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}
@@ -1332,24 +1342,28 @@ TEST(Substrait, JoinPlanInvalidKeys) {
"expression": {
"scalarFunction": {
"functionReference": 0,
- "args": [{
- "selection": {
- "directReference": {
- "structField": {
- "field": 0
+ "arguments": [{
+ "value": {
+ "selection": {
+ "directReference": {
+ "structField": {
+ "field": 0
+ }
+ },
+ "rootReference": {
}
- },
- "rootReference": {
}
}
}, {
- "selection": {
- "directReference": {
- "structField": {
- "field": 5
+ "value": {
+ "selection": {
+ "directReference": {
+ "structField": {
+ "field": 5
+ }
+ },
+ "rootReference": {
}
- },
- "rootReference": {
}
}
}]
diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc
b/cpp/src/arrow/engine/substrait/type_internal.cc
index 178e6d25a5..6c65b32e2a 100644
--- a/cpp/src/arrow/engine/substrait/type_internal.cc
+++ b/cpp/src/arrow/engine/substrait/type_internal.cc
@@ -196,10 +196,11 @@ Result<std::pair<std::shared_ptr<DataType>, bool>>
FromProto(
field("value", std::move(value_nullable.first),
value_nullable.second));
}
- case ::substrait::Type::kUserDefinedTypeReference: {
- uint32_t anchor = type.user_defined_type_reference();
+ case ::substrait::Type::kUserDefined: {
+ const auto& user_defined = type.user_defined();
+ uint32_t anchor = user_defined.type_reference();
ARROW_ASSIGN_OR_RAISE(auto type_record, ext_set.DecodeType(anchor));
- return std::make_pair(std::move(type_record.type), true);
+ return std::make_pair(std::move(type_record.type),
IsNullable(user_defined));
}
default:
@@ -389,7 +390,11 @@ struct DataTypeToProtoImpl {
template <typename T>
Status EncodeUserDefined(const T& t) {
ARROW_ASSIGN_OR_RAISE(auto anchor, ext_set_->EncodeType(t));
- type_->set_user_defined_type_reference(anchor);
+ auto user_defined = internal::make_unique<::substrait::Type_UserDefined>();
+ user_defined->set_type_reference(anchor);
+ user_defined->set_nullability(nullable_ ?
::substrait::Type::NULLABILITY_NULLABLE
+ :
::substrait::Type::NULLABILITY_REQUIRED);
+ type_->set_allocated_user_defined(user_defined.release());
return Status::OK();
}
diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt
index 7dc95cd7e0..5adcb1a2f5 100644
--- a/cpp/thirdparty/versions.txt
+++ b/cpp/thirdparty/versions.txt
@@ -85,8 +85,8 @@
ARROW_SNAPPY_BUILD_SHA256_CHECKSUM=75c1fbb3d618dd3a0483bff0e26d0a92b495bbe5059c8
# There is a bug in GCC < 4.9 with Snappy 1.1.9, so revert to 1.1.8 for those
(ARROW-14661)
ARROW_SNAPPY_OLD_BUILD_VERSION=1.1.8
ARROW_SNAPPY_OLD_BUILD_SHA256_CHECKSUM=16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f
-ARROW_SUBSTRAIT_BUILD_VERSION=e1b4c04a
-ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=65f83e5f5d979ede5fc8ac9f8bbaf793e0c72d9c415f1a162ba522f6d0bb5bbe
+ARROW_SUBSTRAIT_BUILD_VERSION=v0.6.0
+ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=7b8583b9684477e9027f417bbfb4febb8acfeb01923dcaa7cf0fd3f921d69c88
ARROW_THRIFT_BUILD_VERSION=0.16.0
ARROW_THRIFT_BUILD_SHA256_CHECKSUM=f460b5c1ca30d8918ff95ea3eb6291b3951cf518553566088f3f2be8981f6209
ARROW_UCX_BUILD_VERSION=1.12.1
diff --git a/python/pyarrow/tests/test_substrait.py
b/python/pyarrow/tests/test_substrait.py
index 8df35bbba4..98c206fd7e 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -55,7 +55,8 @@ def test_run_serialized_query(tmpdir):
"local_files": {
"items": [
{
- "uri_file": "file://FILENAME_PLACEHOLDER"
+ "uri_file": "file://FILENAME_PLACEHOLDER",
+ "arrow": {}
}
]
}
diff --git a/r/tests/testthat/test-query-engine.R
b/r/tests/testthat/test-query-engine.R
index 8a67c1e764..dd87335f87 100644
--- a/r/tests/testthat/test-query-engine.R
+++ b/r/tests/testthat/test-query-engine.R
@@ -41,7 +41,7 @@ test_that("do_exec_plan_substrait can evaluate a simple
plan", {
"items": [
{
"uri_file": "file://%s",
- "format": "FILE_FORMAT_PARQUET"
+ "parquet": {}
}
]
}