lidavidm commented on a change in pull request #10664:
URL: https://github.com/apache/arrow/pull/10664#discussion_r665600038
##########
File path: cpp/src/arrow/dataset/scanner.cc
##########
@@ -506,73 +444,116 @@ Result<EnumeratedRecordBatchGenerator> FragmentToBatches(
return EnumeratedRecordBatch{record_batch, fragment};
};
- auto combined_gen = MakeMappedGenerator(enumerated_batch_gen,
std::move(combine_fn));
-
- if (filter_and_project) {
- return FilterAndProjectRecordBatchAsync(options, std::move(combined_gen));
- }
- return combined_gen;
+ return MakeMappedGenerator(enumerated_batch_gen, std::move(combine_fn));
}
Result<AsyncGenerator<EnumeratedRecordBatchGenerator>> FragmentsToBatches(
- FragmentGenerator fragment_gen, const std::shared_ptr<ScanOptions>&
options,
- bool filter_and_project = true) {
+ FragmentGenerator fragment_gen, const std::shared_ptr<ScanOptions>&
options) {
auto enumerated_fragment_gen =
MakeEnumeratedGenerator(std::move(fragment_gen));
return MakeMappedGenerator(std::move(enumerated_fragment_gen),
[=](const Enumerated<std::shared_ptr<Fragment>>&
fragment) {
- return FragmentToBatches(fragment, options,
- filter_and_project);
+ return FragmentToBatches(fragment, options);
});
}
-Result<AsyncGenerator<AsyncGenerator<util::optional<int64_t>>>>
FragmentsToRowCount(
- FragmentGenerator fragment_gen,
- std::shared_ptr<ScanOptions> options_with_projection) {
- // Must use optional<int64_t> to avoid breaking the pipeline on empty batches
- auto enumerated_fragment_gen =
MakeEnumeratedGenerator(std::move(fragment_gen));
+Result<compute::ExecNode*> MakeScanNode(compute::ExecPlan* plan,
+ FragmentGenerator fragment_gen,
+ std::shared_ptr<ScanOptions> options) {
+ if (!options->use_async) {
+ return Status::NotImplemented("ScanNodes without asynchrony");
+ }
- // Drop projection since we only need to count rows
- auto options = std::make_shared<ScanOptions>(*options_with_projection);
- RETURN_NOT_OK(SetProjection(options.get(), std::vector<std::string>()));
Review comment:
@bkietz the reason CountRows is slower is that this step (dropping the
projection) is missing from the new implementation :joy: I'm testing with the
NYC Taxi dataset, so this is the difference between loading one integer column
and loading several gigabytes of data.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]