2010YOUY01 commented on code in PR #13540:
URL: https://github.com/apache/datafusion/pull/13540#discussion_r1858534991
##########
datafusion/physical-plan/src/memory.rs:
##########
@@ -365,8 +365,165 @@ impl RecordBatchStream for MemoryStream {
}
}
+pub trait StreamingBatchGenerator: Send + Sync + fmt::Debug + fmt::Display {
+ /// Generate the next batch, return `None` when no more batches are
available
+ fn generate_next_batch(&mut self) -> Result<Option<RecordBatch>>;
+
+ /// Creates a boxed clone of this generator.
+ ///
+ /// This method is required because `Clone` cannot be directly implemented
for
+ /// trait objects. It provides a way to clone trait objects of
+ /// StreamingBatchGenerator while maintaining proper type erasure.
+ fn clone_box(&self) -> Box<dyn StreamingBatchGenerator>;
+}
+
+/// Execution plan for streaming in-memory batches of data
+///
+/// This plan generates output batches lazily, it doesn't have to buffer all
batches
+/// in memory up front (compared to `MemoryExec`), thus consuming constant
memory.
+pub struct StreamingMemoryExec {
+ /// Schema representing the data
+ schema: SchemaRef,
+ /// Functions to generate batches for each partition
+ batch_generators: Vec<Box<dyn StreamingBatchGenerator>>,
+ /// Total number of rows to generate for statistics
+ cache: PlanProperties,
+}
+
+impl StreamingMemoryExec {
+ /// Create a new streaming memory execution plan
+ pub fn try_new(
+ schema: SchemaRef,
+ generators: Vec<Box<dyn StreamingBatchGenerator>>,
+ ) -> Result<Self> {
+ let cache = PlanProperties::new(
+ EquivalenceProperties::new(Arc::clone(&schema)),
+ Partitioning::RoundRobinBatch(generators.len()),
+ ExecutionMode::Bounded,
+ );
+ Ok(Self {
+ schema,
+ batch_generators: generators,
+ cache,
+ })
+ }
+}
+
+impl fmt::Debug for StreamingMemoryExec {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("StreamingMemoryExec")
+ .field("schema", &self.schema)
+ .field("batch_generators", &self.batch_generators)
+ .finish()
+ }
+}
+
+impl DisplayAs for StreamingMemoryExec {
+ fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) ->
fmt::Result {
+ match t {
+ DisplayFormatType::Default | DisplayFormatType::Verbose => {
+ write!(
+ f,
+ "StreamingMemoryExec: partitions={},
batch_generators=[{}]",
+ self.batch_generators.len(),
+ self.batch_generators
+ .iter()
+ .map(|g| g.to_string())
+ .collect::<Vec<_>>()
+ .join(", ")
+ )
+ }
+ }
+ }
+}
+
+impl ExecutionPlan for StreamingMemoryExec {
+ fn name(&self) -> &'static str {
+ "StreamingMemoryExec"
+ }
+
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn schema(&self) -> SchemaRef {
+ Arc::clone(&self.schema)
+ }
+
+ fn properties(&self) -> &PlanProperties {
+ &self.cache
+ }
+
+ fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+ vec![]
+ }
+
+ fn with_new_children(
+ self: Arc<Self>,
+ children: Vec<Arc<dyn ExecutionPlan>>,
+ ) -> Result<Arc<dyn ExecutionPlan>> {
+ if children.is_empty() {
+ Ok(self)
+ } else {
+ internal_err!("Children cannot be replaced in StreamingMemoryExec")
+ }
+ }
+
+ fn execute(
+ &self,
+ partition: usize,
+ _context: Arc<TaskContext>,
+ ) -> Result<SendableRecordBatchStream> {
+ if partition >= self.batch_generators.len() {
+ return internal_err!(
+ "Invalid partition {} for StreamingMemoryExec with {}
partitions",
+ partition,
+ self.batch_generators.len()
+ );
+ }
+
+ Ok(Box::pin(StreamingMemoryStream {
+ schema: Arc::clone(&self.schema),
+ generator: self.batch_generators[partition].clone_box(),
+ }))
+ }
+
+ fn statistics(&self) -> Result<Statistics> {
+ Ok(Statistics::new_unknown(&self.schema))
+ }
+}
+
+/// Stream that generates record batches on demand
+pub struct StreamingMemoryStream {
+ schema: SchemaRef,
+ generator: Box<dyn StreamingBatchGenerator>,
Review Comment:
Good point, `Arc` is more flexible: an implementation can choose to share a
single `StreamingBatchGenerator` between multiple streams, or to create a
separate generator for each stream.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]