gabotechs commented on code in PR #22010:
URL: https://github.com/apache/datafusion/pull/22010#discussion_r3237098214
##########
datafusion/physical-plan/src/repartition/mod.rs:
##########
@@ -143,11 +144,183 @@ type MaybeBatch = Option<Result<RepartitionBatch>>;
type InputPartitionsToCurrentPartitionSender =
Vec<DistributionSender<MaybeBatch>>;
type InputPartitionsToCurrentPartitionReceiver =
Vec<DistributionReceiver<MaybeBatch>>;
-/// Output channel with its associated memory reservation and spill writer
+/// One input task's collection of output channels (its send-side view of
+/// every output partition). Owns the per-call helpers for coalescing,
+/// finalizing, and sending.
+///
+/// This complements [`PartitionChannels`] (the per-output-partition,
+/// authoritative struct that owns `rx`, `spill_readers`, and the underlying
+/// `Mutex<LimitedBatchCoalescer>` / `AtomicUsize`). Each [`OutputChannel`]
+/// inside `inner` holds cloned `Arc`s pointing back at those shared
+/// resources.
+struct OutputChannels {
+ inner: HashMap<usize, OutputChannel>,
+ metrics: RepartitionMetrics,
+}
+
+impl OutputChannels {
+ fn new(inner: HashMap<usize, OutputChannel>, metrics: RepartitionMetrics)
-> Self {
+ Self { inner, metrics }
+ }
+
+ fn is_empty(&self) -> bool {
+ self.inner.is_empty()
+ }
+
+ fn metrics(&self) -> &RepartitionMetrics {
+ &self.metrics
+ }
+
+ /// Push `batch` for `partition` through its shared coalescer (if any)
+ /// and ship any newly completed batches through the channel.
+ async fn coalesce_and_send(
+ &mut self,
+ partition: usize,
+ batch: RecordBatch,
+ ) -> Result<()> {
+ let Some(channel) = self.inner.get(&partition) else {
+ return Ok(());
+ };
+ let to_send = match &channel.shared_coalescer {
+ Some(shared) => shared.push_and_drain(batch)?,
+ None => vec![batch],
+ };
+ for batch in to_send {
+ self.send_to_channel(partition, batch).await;
+ }
+ Ok(())
+ }
+
+ /// For each output partition this task still has, decrement the shared
+ /// active-senders counter. The last task to do so calls
+ /// [`SharedCoalescer::finalize`] and ships the residual.
+ async fn finalize(&mut self) -> Result<()> {
+ let partitions: Vec<usize> = self.inner.keys().copied().collect();
+ for partition in partitions {
+ let Some(channel) = self.inner.get(&partition) else {
+ continue;
+ };
+ let Some(shared) = channel.shared_coalescer.clone() else {
+ continue;
+ };
+ for batch in shared.finalize()? {
+ self.send_to_channel(partition, batch).await;
+ }
+ }
+ Ok(())
+ }
+
+ /// Send a single batch through the channel for `partition`, applying
+ /// the memory reservation / spill-writer fallback. Removes the channel
+ /// from `self.inner` if the receiver has hung up.
+ async fn send_to_channel(&mut self, partition: usize, batch: RecordBatch) {
+ let size = batch.get_array_memory_size();
+ let timer = self.metrics.send_time[partition].timer();
+
+ // Decide the payload outside of any await: never hold a MutexGuard
+ // across an await point.
+ let (payload, is_memory_batch) = {
+ let Some(channel) = self.inner.get(&partition) else {
+ timer.done();
+ return;
+ };
+ match channel.reservation.try_grow(size) {
+ Ok(_) => (Ok(RepartitionBatch::Memory(batch)), true),
+ Err(_) => match channel.spill_writer.push_batch(&batch) {
+ Ok(()) => (Ok(RepartitionBatch::Spilled), false),
+ Err(err) => (Err(err), false),
+ },
+ }
+ };
+
+ let Some(channel) = self.inner.get(&partition) else {
+ timer.done();
+ return;
+ };
+ let send_err = channel.sender.send(Some(payload)).await.is_err();
+ if send_err {
+ if is_memory_batch && let Some(channel) =
self.inner.get(&partition) {
+ channel.reservation.shrink(size);
+ }
+ self.inner.remove(&partition);
+ }
+ timer.done();
+ }
+}
+
+/// A producer-side coalescer shared across all input tasks targeting a
+/// single output partition.
+///
+/// Bundles the [`LimitedBatchCoalescer`] (behind a [`Mutex`]) with the
+/// active-sender counter that tracks how many input tasks may still push
+/// into it. The last task to call [`Self::finalize`] is the one that
+/// finalizes the coalescer and ships the residual batch.
+///
+/// Cheap to [`Clone`]: both fields are [`Arc`]s.
+#[derive(Clone)]
+struct SharedCoalescer {
+ inner: Arc<Mutex<LimitedBatchCoalescer>>,
+ active_senders: Arc<AtomicUsize>,
+}
+
+impl SharedCoalescer {
+ fn new(
+ schema: SchemaRef,
+ target_batch_size: usize,
+ fetch: Option<usize>,
+ num_senders: usize,
+ ) -> Self {
+ Self {
+ inner: Arc::new(Mutex::new(LimitedBatchCoalescer::new(
+ schema,
+ target_batch_size,
+ fetch,
+ ))),
+ active_senders: Arc::new(AtomicUsize::new(num_senders)),
+ }
+ }
+
+ /// Push `batch` into the coalescer and drain any newly completed
+ /// batches. The mutex is held only briefly.
+ fn push_and_drain(&self, batch: RecordBatch) -> Result<Vec<RecordBatch>> {
+ let mut acc = Vec::new();
+ let mut c = self.inner.lock();
+ c.push_batch(batch)?;
Review Comment:
:thinking: in your link, it doesn't have it because I removed it from where
it was, in favor of having it here. But if you look at `main`, there is some
batch coalescing happening in the `PerPartitionStream` struct:
https://github.com/apache/datafusion/blob/main/datafusion/physical-plan/src/repartition/mod.rs#L1665
And what we have in `main` today does implement a limit.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]