alamb commented on code in PR #12174:
URL: https://github.com/apache/datafusion/pull/12174#discussion_r1734614514
##########
datafusion/physical-plan/src/sorts/sort.rs:
##########
@@ -933,24 +965,59 @@ impl ExecutionPlan for SortExec {
}
fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn
ExecutionPlan>> {
- Some(Arc::new(SortExec {
- input: Arc::clone(&self.input),
- expr: self.expr.clone(),
- metrics_set: self.metrics_set.clone(),
- preserve_partitioning: self.preserve_partitioning,
- fetch: limit,
- cache: self.cache.clone(),
- }))
+ Some(Arc::new(SortExec::with_fetch(self, limit)))
}
fn fetch(&self) -> Option<usize> {
self.fetch
}
}
+struct TopKStream {
+ input: SendableRecordBatchStream,
+ schema: SchemaRef,
+ fetch: usize,
+}
+
+impl Stream for TopKStream {
Review Comment:
This looks very similar to `LimitStream` --
https://docs.rs/datafusion-physical-plan/41.0.0/src/datafusion_physical_plan/limit.rs.html#434
though `LimitStream` also has metrics and some other features.
##########
datafusion/physical-plan/src/sorts/sort.rs:
##########
@@ -874,53 +892,67 @@ impl ExecutionPlan for SortExec {
trace!("End SortExec's input.execute for partition: {}", partition);
- if let Some(fetch) = self.fetch.as_ref() {
- let mut topk = TopK::try_new(
- partition,
- input.schema(),
- self.expr.clone(),
- *fetch,
- context.session_config().batch_size(),
- context.runtime_env(),
- &self.metrics_set,
- partition,
- )?;
-
- Ok(Box::pin(RecordBatchStreamAdapter::new(
- self.schema(),
- futures::stream::once(async move {
- while let Some(batch) = input.next().await {
- let batch = batch?;
- topk.insert_batch(batch)?;
- }
- topk.emit()
- })
- .try_flatten(),
- )))
- } else {
- let mut sorter = ExternalSorter::new(
- partition,
- input.schema(),
- self.expr.clone(),
- context.session_config().batch_size(),
- self.fetch,
- execution_options.sort_spill_reservation_bytes,
- execution_options.sort_in_place_threshold_bytes,
- &self.metrics_set,
- context.runtime_env(),
+ let sort_satisfied = self
Review Comment:
This is the same calculation as `self.execution_mode()`, right? Maybe we
could call `self.execution_mode()` here instead, to be more efficient and to
ensure the calculations remain in sync.
##########
datafusion/physical-plan/src/sorts/sort.rs:
##########
@@ -933,24 +965,59 @@ impl ExecutionPlan for SortExec {
}
fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn
ExecutionPlan>> {
- Some(Arc::new(SortExec {
- input: Arc::clone(&self.input),
- expr: self.expr.clone(),
- metrics_set: self.metrics_set.clone(),
- preserve_partitioning: self.preserve_partitioning,
- fetch: limit,
- cache: self.cache.clone(),
- }))
+ Some(Arc::new(SortExec::with_fetch(self, limit)))
}
fn fetch(&self) -> Option<usize> {
self.fetch
}
}
+struct TopKStream {
Review Comment:
I think it would help to add documentation to this struct, specifically
documentation that explains how it differs from `TopK`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]