westonpace commented on code in PR #15183:
URL: https://github.com/apache/arrow/pull/15183#discussion_r1066419753
##########
cpp/examples/arrow/execution_plan_documentation_examples.cc:
##########
@@ -761,6 +761,40 @@ arrow::Status TableSinkExample() {
std::cout << "Results : " << output_table->ToString() << std::endl;
return arrow::Status::OK();
}
+
+// (Doc section: Table Sink Example)
+
+// (Doc section: RecordBatchReaderSource Example)
+
+/// \brief An example showing the usage of a RecordBatchReader as the data
source.
+///
+/// RecordBatchReaderSourceSink Example
+/// This example shows how a record_batch_reader_source can be used
+/// in an execution plan. This includes the source node
+/// receiving data from a TableRecordBatchReader.
+
+arrow::Status RecordBatchReaderSourceSinkExample() {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<cp::ExecPlan> plan,
+ cp::ExecPlan::Make(*cp::threaded_exec_context()));
+
+ std::cout << "basic data created" << std::endl;
Review Comment:
```suggestion
```
I know some of these examples print to standard out. However, most of the
examples are only printing the plan and the results. We should aim for
consistency.
##########
cpp/examples/arrow/execution_plan_documentation_examples.cc:
##########
@@ -761,6 +761,40 @@ arrow::Status TableSinkExample() {
std::cout << "Results : " << output_table->ToString() << std::endl;
return arrow::Status::OK();
}
+
+// (Doc section: Table Sink Example)
+
+// (Doc section: RecordBatchReaderSource Example)
+
+/// \brief An example showing the usage of a RecordBatchReader as the data
source.
+///
+/// RecordBatchReaderSourceSink Example
+/// This example shows how a record_batch_reader_source can be used
+/// in an execution plan. This includes the source node
+/// receiving data from a TableRecordBatchReader.
+
+arrow::Status RecordBatchReaderSourceSinkExample() {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<cp::ExecPlan> plan,
+ cp::ExecPlan::Make(*cp::threaded_exec_context()));
+
+ std::cout << "basic data created" << std::endl;
+
+ arrow::AsyncGenerator<std::optional<cp::ExecBatch>> sink_gen;
+ ARROW_ASSIGN_OR_RAISE(auto table, GetTable());
+ std::shared_ptr<arrow::RecordBatchReader> reader =
+ std::make_shared<arrow::TableBatchReader>(table);
+
+ ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source,
+ cp::MakeExecNode("record_batch_reader_source",
plan.get(), {},
+
cp::RecordBatchReaderSourceNodeOptions{reader}));
+ ARROW_RETURN_NOT_OK(cp::MakeExecNode(
+ "order_by_sink", plan.get(), {source},
+ cp::OrderBySinkNodeOptions{
+ cp::SortOptions{{cp::SortKey{"a", cp::SortOrder::Descending}}},
&sink_gen}));
+
+ return ExecutePlanAndCollectAsTableWithCustomSink(plan, table->schema(),
sink_gen);
Review Comment:
```suggestion
ARROW_ASSIGN_OR_RAISE(auto table, GetTable());
std::shared_ptr<arrow::RecordBatchReader> reader =
std::make_shared<arrow::TableBatchReader>(table);
Declaration reader_source{"record_batch_reader_source",
cp::RecordBatchReaderSourceNodeOptions{reader}};
return ExecutePlanAndCollectAsTable(std::move(reader_source));
```
I don't think we need to use the order by sink. Let's base this on
SourceSinkExample.
##########
cpp/src/arrow/compute/exec/plan_test.cc:
##########
@@ -344,6 +344,40 @@ void TestSourceSink(
Finishes(ResultWith(UnorderedElementsAreArray(exp_batches.batches))));
}
+void TestRecordBatchReaderSourceSink(
+ std::function<Result<std::shared_ptr<RecordBatchReader>>(const
BatchesWithSchema&)>
+ to_reader) {
+ for (bool parallel : {false, true}) {
+ SCOPED_TRACE(parallel ? "parallel/merged" : "serial");
+ auto exp_batches = MakeBasicBatches();
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<RecordBatchReader> reader,
+ to_reader(exp_batches));
+ RecordBatchReaderSourceNodeOptions options{reader};
+ Declaration plan("record_batch_reader_source", std::move(options));
+ ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(plan, false));
Review Comment:
```suggestion
ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(plan,
parallel));
```
##########
cpp/src/arrow/compute/exec/source_node.cc:
##########
@@ -327,6 +327,53 @@ struct SchemaSourceNode : public SourceNode {
}
};
+struct RecordBatchReaderSourceNode : public SourceNode {
+ RecordBatchReaderSourceNode(ExecPlan* plan, std::shared_ptr<Schema> schema,
+ arrow::AsyncGenerator<std::optional<ExecBatch>>
generator)
+ : SourceNode(plan, schema, generator) {}
+
+ static Result<ExecNode*> Make(ExecPlan* plan, std::vector<ExecNode*> inputs,
+ const ExecNodeOptions& options) {
+ RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 0, kKindName));
+ const auto& cast_options =
+ checked_cast<const RecordBatchReaderSourceNodeOptions&>(options);
+ auto& reader = cast_options.reader;
+ auto io_executor = cast_options.io_executor;
+
+ if (reader == NULLPTR) {
+ return Status::Invalid(kKindName, " requires a reader which is not
null");
+ }
+
+ if (io_executor == NULLPTR) {
+ io_executor = io::internal::GetIOThreadPool();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto generator, MakeGenerator(reader, io_executor));
+ return plan->EmplaceNode<RecordBatchReaderSourceNode>(plan,
reader->schema(),
+ generator);
+ }
+
+ static Result<arrow::AsyncGenerator<std::optional<ExecBatch>>> MakeGenerator(
+ const std::shared_ptr<RecordBatchReader>& reader,
+ arrow::internal::Executor* io_executor) {
+ const auto& schema = reader->schema();
Review Comment:
```suggestion
```
##########
cpp/src/arrow/compute/exec/source_node.cc:
##########
@@ -327,6 +327,53 @@ struct SchemaSourceNode : public SourceNode {
}
};
+struct RecordBatchReaderSourceNode : public SourceNode {
+ RecordBatchReaderSourceNode(ExecPlan* plan, std::shared_ptr<Schema> schema,
+ arrow::AsyncGenerator<std::optional<ExecBatch>>
generator)
+ : SourceNode(plan, schema, generator) {}
+
+ static Result<ExecNode*> Make(ExecPlan* plan, std::vector<ExecNode*> inputs,
+ const ExecNodeOptions& options) {
+ RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 0, kKindName));
+ const auto& cast_options =
+ checked_cast<const RecordBatchReaderSourceNodeOptions&>(options);
+ auto& reader = cast_options.reader;
+ auto io_executor = cast_options.io_executor;
+
+ if (reader == NULLPTR) {
+ return Status::Invalid(kKindName, " requires a reader which is not
null");
+ }
+
+ if (io_executor == NULLPTR) {
+ io_executor = io::internal::GetIOThreadPool();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto generator, MakeGenerator(reader, io_executor));
+ return plan->EmplaceNode<RecordBatchReaderSourceNode>(plan,
reader->schema(),
+ generator);
+ }
+
+ static Result<arrow::AsyncGenerator<std::optional<ExecBatch>>> MakeGenerator(
+ const std::shared_ptr<RecordBatchReader>& reader,
+ arrow::internal::Executor* io_executor) {
+ const auto& schema = reader->schema();
+ auto to_exec_batch =
+ [schema](const std::shared_ptr<RecordBatch>& batch) ->
std::optional<ExecBatch> {
+ if (batch == NULLPTR || *batch->schema() != *schema) {
Review Comment:
```suggestion
if (batch == nullptr) {
```
I don't think we need to verify that the batches produced by the
record batch reader have the correct schema.
##########
cpp/src/arrow/compute/exec/source_node.cc:
##########
@@ -327,6 +327,53 @@ struct SchemaSourceNode : public SourceNode {
}
};
+struct RecordBatchReaderSourceNode : public SourceNode {
+ RecordBatchReaderSourceNode(ExecPlan* plan, std::shared_ptr<Schema> schema,
+ arrow::AsyncGenerator<std::optional<ExecBatch>>
generator)
+ : SourceNode(plan, schema, generator) {}
+
+ static Result<ExecNode*> Make(ExecPlan* plan, std::vector<ExecNode*> inputs,
+ const ExecNodeOptions& options) {
+ RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 0, kKindName));
+ const auto& cast_options =
+ checked_cast<const RecordBatchReaderSourceNodeOptions&>(options);
+ auto& reader = cast_options.reader;
+ auto io_executor = cast_options.io_executor;
+
+ if (reader == NULLPTR) {
+ return Status::Invalid(kKindName, " requires a reader which is not
null");
+ }
+
+ if (io_executor == NULLPTR) {
Review Comment:
```suggestion
if (io_executor == nullptr) {
```
##########
cpp/src/arrow/compute/exec/source_node.cc:
##########
@@ -327,6 +327,53 @@ struct SchemaSourceNode : public SourceNode {
}
};
+struct RecordBatchReaderSourceNode : public SourceNode {
+ RecordBatchReaderSourceNode(ExecPlan* plan, std::shared_ptr<Schema> schema,
+ arrow::AsyncGenerator<std::optional<ExecBatch>>
generator)
+ : SourceNode(plan, schema, generator) {}
+
+ static Result<ExecNode*> Make(ExecPlan* plan, std::vector<ExecNode*> inputs,
+ const ExecNodeOptions& options) {
+ RETURN_NOT_OK(ValidateExecNodeInputs(plan, inputs, 0, kKindName));
+ const auto& cast_options =
+ checked_cast<const RecordBatchReaderSourceNodeOptions&>(options);
+ auto& reader = cast_options.reader;
+ auto io_executor = cast_options.io_executor;
+
+ if (reader == NULLPTR) {
Review Comment:
```suggestion
if (reader == nullptr) {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]