alamb commented on code in PR #12174:
URL: https://github.com/apache/datafusion/pull/12174#discussion_r1734614514


##########
datafusion/physical-plan/src/sorts/sort.rs:
##########
@@ -933,24 +965,59 @@ impl ExecutionPlan for SortExec {
     }
 
     fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn 
ExecutionPlan>> {
-        Some(Arc::new(SortExec {
-            input: Arc::clone(&self.input),
-            expr: self.expr.clone(),
-            metrics_set: self.metrics_set.clone(),
-            preserve_partitioning: self.preserve_partitioning,
-            fetch: limit,
-            cache: self.cache.clone(),
-        }))
+        Some(Arc::new(SortExec::with_fetch(self, limit)))
     }
 
     fn fetch(&self) -> Option<usize> {
         self.fetch
     }
 }
 
+struct TopKStream {
+    input: SendableRecordBatchStream,
+    schema: SchemaRef,
+    fetch: usize,
+}
+
+impl Stream for TopKStream {

Review Comment:
   This looks very similar to `LimitStream` -- 
https://docs.rs/datafusion-physical-plan/41.0.0/src/datafusion_physical_plan/limit.rs.html#434
   
   though `LimitStream` has metrics and some other features.
   



##########
datafusion/physical-plan/src/sorts/sort.rs:
##########
@@ -874,53 +892,67 @@ impl ExecutionPlan for SortExec {
 
         trace!("End SortExec's input.execute for partition: {}", partition);
 
-        if let Some(fetch) = self.fetch.as_ref() {
-            let mut topk = TopK::try_new(
-                partition,
-                input.schema(),
-                self.expr.clone(),
-                *fetch,
-                context.session_config().batch_size(),
-                context.runtime_env(),
-                &self.metrics_set,
-                partition,
-            )?;
-
-            Ok(Box::pin(RecordBatchStreamAdapter::new(
-                self.schema(),
-                futures::stream::once(async move {
-                    while let Some(batch) = input.next().await {
-                        let batch = batch?;
-                        topk.insert_batch(batch)?;
-                    }
-                    topk.emit()
-                })
-                .try_flatten(),
-            )))
-        } else {
-            let mut sorter = ExternalSorter::new(
-                partition,
-                input.schema(),
-                self.expr.clone(),
-                context.session_config().batch_size(),
-                self.fetch,
-                execution_options.sort_spill_reservation_bytes,
-                execution_options.sort_in_place_threshold_bytes,
-                &self.metrics_set,
-                context.runtime_env(),
+        let sort_satisfied = self

Review Comment:
   This is the same calculation as `self.execution_mode()`, right? Maybe we 
could call `self.execution_mode()` here instead, to be more efficient and to 
ensure the calculations remain in sync.



##########
datafusion/physical-plan/src/sorts/sort.rs:
##########
@@ -933,24 +965,59 @@ impl ExecutionPlan for SortExec {
     }
 
     fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn 
ExecutionPlan>> {
-        Some(Arc::new(SortExec {
-            input: Arc::clone(&self.input),
-            expr: self.expr.clone(),
-            metrics_set: self.metrics_set.clone(),
-            preserve_partitioning: self.preserve_partitioning,
-            fetch: limit,
-            cache: self.cache.clone(),
-        }))
+        Some(Arc::new(SortExec::with_fetch(self, limit)))
     }
 
     fn fetch(&self) -> Option<usize> {
         self.fetch
     }
 }
 
+struct TopKStream {

Review Comment:
   I think it would help to add documentation to this struct, specifically 
documentation that explains how it differs from `TopK`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to