westonpace commented on code in PR #14118:
URL: https://github.com/apache/arrow/pull/14118#discussion_r1048186989
##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -562,6 +563,33 @@ Result<std::shared_ptr<Schema>> ExtractSchemaToBind(const
compute::Declaration&
} else if (declr.factory_name == "filter") {
auto input_declr = std::get<compute::Declaration>(declr.inputs[0]);
ARROW_ASSIGN_OR_RAISE(bind_schema, ExtractSchemaToBind(input_declr));
+ } else if (declr.factory_name == "project") {
+ auto input_declr = std::get<compute::Declaration>(declr.inputs[0]);
+ ARROW_ASSIGN_OR_RAISE(auto input_schema, ExtractSchemaToBind(input_declr));
+ const int num_fields_before_proj = input_schema->num_fields();
+ const auto& opts = checked_cast<const
compute::ProjectNodeOptions&>(*(declr.options));
+ const auto& exprs = opts.expressions;
+ int i = 0;
+ bind_schema = input_schema;
+ for (const auto& expr : exprs) {
+ std::shared_ptr<Field> project_field;
+ auto bound_expr = expr.Bind(*input_schema);
+ if (auto* expr_call = bound_expr->call()) {
+ project_field = field(expr_call->function_name,
+ expr_call->kernel->signature->out_type().type());
+ } else if (auto* field_ref = bound_expr->field_ref()) {
+ ARROW_ASSIGN_OR_RAISE(FieldPath field_path,
field_ref->FindOne(*input_schema));
+ ARROW_ASSIGN_OR_RAISE(project_field, field_path.Get(*input_schema));
+ } else if (auto* literal = bound_expr->literal()) {
+ project_field =
+ field("field_" + std::to_string(num_fields_before_proj + i),
literal->type());
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ bind_schema, bind_schema->AddField(
+ num_fields_before_proj +
static_cast<int>(exprs.size()) - 1,
+ std::move(project_field)));
Review Comment:
> Ah, is it because the internal::AddVectorElement function used within AddField
> is going to be expensive? (I see loops)

Yes. `Schema` is immutable, so the only way `AddField` can work is by
creating an entirely new schema object, which means it has to copy all the
vectors and shared pointers. Ideally we only have to do this once.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]