alamb commented on code in PR #22010:
URL: https://github.com/apache/datafusion/pull/22010#discussion_r3248382963
##########
datafusion/physical-plan/src/repartition/mod.rs:
##########
@@ -143,11 +153,120 @@ type MaybeBatch = Option<Result<RepartitionBatch>>;
type InputPartitionsToCurrentPartitionSender =
Vec<DistributionSender<MaybeBatch>>;
type InputPartitionsToCurrentPartitionReceiver =
Vec<DistributionReceiver<MaybeBatch>>;
-/// Output channel with its associated memory reservation and spill writer
+/// Output channel with its associated memory reservation and spill writer.
+///
+/// `coalescer` is `None` for preserve-order mode, where downstream
+/// [`StreamingMergeBuilder`] performs the batching; otherwise it's a
+/// [`SharedCoalescer`] cloned from the per-partition one held by
+/// [`PartitionChannels`].
struct OutputChannel {
sender: DistributionSender<MaybeBatch>,
reservation: SharedMemoryReservation,
spill_writer: SpillPoolWriter,
+ shared_coalescer: Option<SharedCoalescer>,
+}
+
+impl OutputChannel {
+ fn coalesce(&mut self, batch: RecordBatch) -> Result<Vec<RecordBatch>> {
+ match &self.shared_coalescer {
+ Some(shared) => Ok(shared.push_and_drain(batch)?),
+ None => Ok(vec![batch]),
+ }
+ }
+
+ /// Send a single batch through the channel for `partition`, applying
Review Comment:
Maybe worth pointing out in the comment that this is used *after* coalescing
(and doesn't internally coalesce)
##########
datafusion/physical-plan/src/repartition/mod.rs:
##########
@@ -797,6 +934,34 @@ impl BatchPartitioner {
/// arbitrary interleaving (and thus unordered) unless
/// [`Self::with_preserve_order`] specifies otherwise.
///
+/// # Batch coalescing
+///
+/// Repartitioning one [`RecordBatch`] implies creating multiple smaller
batches, potentially
+/// as many as the number of output partitions. [`RepartitionExec`] makes sure
that the returned
+/// batches adhere to the configured `datafusion.execution.batch_size` for
efficient operations,
+/// and for that, it will automatically coalesce batches right after
repartitioning.
+///
+/// For this, one shared [`LimitedBatchCoalescer`] per output partition is
used:
Review Comment:
love it 😍
##########
datafusion/physical-plan/src/repartition/mod.rs:
##########
@@ -143,11 +144,183 @@ type MaybeBatch = Option<Result<RepartitionBatch>>;
type InputPartitionsToCurrentPartitionSender =
Vec<DistributionSender<MaybeBatch>>;
type InputPartitionsToCurrentPartitionReceiver =
Vec<DistributionReceiver<MaybeBatch>>;
-/// Output channel with its associated memory reservation and spill writer
+/// One input task's collection of output channels (its send-side view of
+/// every output partition). Owns the per-call helpers for coalescing,
+/// finalizing, and sending.
+///
+/// This complements [`PartitionChannels`] (the per-output-partition,
+/// authoritative struct that owns `rx`, `spill_readers`, and the underlying
+/// `Mutex<LimitedBatchCoalescer>` / `AtomicUsize`). Each [`OutputChannel`]
+/// inside `inner` holds cloned `Arc`s pointing back at those shared
+/// resources.
+struct OutputChannels {
+ inner: HashMap<usize, OutputChannel>,
+ metrics: RepartitionMetrics,
+}
+
+impl OutputChannels {
+ fn new(inner: HashMap<usize, OutputChannel>, metrics: RepartitionMetrics)
-> Self {
+ Self { inner, metrics }
+ }
+
+ fn is_empty(&self) -> bool {
+ self.inner.is_empty()
+ }
+
+ fn metrics(&self) -> &RepartitionMetrics {
+ &self.metrics
+ }
+
+ /// Push `batch` for `partition` through its shared coalescer (if any)
+ /// and ship any newly completed batches through the channel.
+ async fn coalesce_and_send(
+ &mut self,
+ partition: usize,
+ batch: RecordBatch,
+ ) -> Result<()> {
+ let Some(channel) = self.inner.get(&partition) else {
+ return Ok(());
+ };
+ let to_send = match &channel.shared_coalescer {
+ Some(shared) => shared.push_and_drain(batch)?,
+ None => vec![batch],
+ };
+ for batch in to_send {
+ self.send_to_channel(partition, batch).await;
+ }
+ Ok(())
+ }
+
+ /// For each output partition this task still has, decrement the shared
+ /// active-senders counter. The last task to do so calls
+ /// [`SharedCoalescer::finalize`] and ships the residual.
+ async fn finalize(&mut self) -> Result<()> {
+ let partitions: Vec<usize> = self.inner.keys().copied().collect();
+ for partition in partitions {
+ let Some(channel) = self.inner.get(&partition) else {
+ continue;
+ };
+ let Some(shared) = channel.shared_coalescer.clone() else {
+ continue;
+ };
+ for batch in shared.finalize()? {
+ self.send_to_channel(partition, batch).await;
+ }
+ }
+ Ok(())
+ }
+
+ /// Send a single batch through the channel for `partition`, applying
+ /// the memory reservation / spill-writer fallback. Removes the channel
+ /// from `self.inner` if the receiver has hung up.
+ async fn send_to_channel(&mut self, partition: usize, batch: RecordBatch) {
+ let size = batch.get_array_memory_size();
+ let timer = self.metrics.send_time[partition].timer();
+
+ // Decide the payload outside of any await: never hold a MutexGuard
+ // across an await point.
+ let (payload, is_memory_batch) = {
+ let Some(channel) = self.inner.get(&partition) else {
+ timer.done();
+ return;
+ };
+ match channel.reservation.try_grow(size) {
+ Ok(_) => (Ok(RepartitionBatch::Memory(batch)), true),
+ Err(_) => match channel.spill_writer.push_batch(&batch) {
+ Ok(()) => (Ok(RepartitionBatch::Spilled), false),
+ Err(err) => (Err(err), false),
+ },
+ }
+ };
+
+ let Some(channel) = self.inner.get(&partition) else {
+ timer.done();
+ return;
+ };
+ let send_err = channel.sender.send(Some(payload)).await.is_err();
+ if send_err {
+ if is_memory_batch && let Some(channel) =
self.inner.get(&partition) {
+ channel.reservation.shrink(size);
+ }
+ self.inner.remove(&partition);
+ }
+ timer.done();
+ }
+}
+
+/// A producer-side coalescer shared across all input tasks targeting a
+/// single output partition.
+///
+/// Bundles the [`LimitedBatchCoalescer`] (behind a [`Mutex`]) with the
+/// active-sender counter that tracks how many input tasks may still push
+/// into it. The last task to call [`Self::finalize`] is the one that
+/// finalizes the coalescer and ships the residual batch.
+///
+/// Cheap to [`Clone`]: both fields are [`Arc`]s.
+#[derive(Clone)]
+struct SharedCoalescer {
+ inner: Arc<Mutex<LimitedBatchCoalescer>>,
+ active_senders: Arc<AtomicUsize>,
+}
+
+impl SharedCoalescer {
+ fn new(
+ schema: SchemaRef,
+ target_batch_size: usize,
+ fetch: Option<usize>,
+ num_senders: usize,
+ ) -> Self {
+ Self {
+ inner: Arc::new(Mutex::new(LimitedBatchCoalescer::new(
+ schema,
+ target_batch_size,
+ fetch,
+ ))),
+ active_senders: Arc::new(AtomicUsize::new(num_senders)),
+ }
+ }
+
+ /// Push `batch` into the coalescer and drain any newly completed
+ /// batches. The mutex is held only briefly.
+ fn push_and_drain(&self, batch: RecordBatch) -> Result<Vec<RecordBatch>> {
+ let mut acc = Vec::new();
+ let mut c = self.inner.lock();
Review Comment:
Yeah, I think it is a good idea to go with this approach, and maybe file
a ticket describing the potential bottleneck and work on it as a follow-on
Thinking about this more after some sleep -- I am still trying to figure out
in what scenarios contention on this Mutex could cause issues. I think it
could be bad if there is one output partition getting all the output. However,
I think putting the mutex on the sender side will block the producers (while
they wait on the mutex)
I can't really think of scenarios where this will be a problem (if one
partition is getting all the data, that is going to be bad in any event)
##########
datafusion/physical-plan/src/repartition/mod.rs:
##########
@@ -1473,33 +1636,17 @@ impl RepartitionExec {
for res in partitioner.partition_iter(batch)? {
let (partition, batch) = res?;
- let size = batch.get_array_memory_size();
let timer = metrics.send_time[partition].timer();
// if there is still a receiver, send to it
- if let Some(channel) = output_channels.get_mut(&partition) {
- let (batch_to_send, is_memory_batch) =
- match channel.reservation.try_grow(size) {
- Ok(_) => {
- // Memory available - send in-memory batch
- (RepartitionBatch::Memory(batch), true)
- }
- Err(_) => {
- // We're memory limited - spill to SpillPool
- // SpillPool handles file handle reuse and
rotation
- channel.spill_writer.push_batch(&batch)?;
- // Send marker indicating batch was spilled
- (RepartitionBatch::Spilled, false)
- }
- };
-
- if
channel.sender.send(Some(Ok(batch_to_send))).await.is_err() {
- // If the other end has hung up, it was an early
shutdown (e.g. LIMIT)
- // Only shrink memory if it was a memory batch
- if is_memory_batch {
- channel.reservation.shrink(size);
+ if let Some(output_channel) =
output_channels.get_mut(&partition) {
+ for batch in output_channel.coalesce(batch)? {
+ if output_channel.send(batch).await.is_err() {
+ // If the other end has hung up, it was an early
shutdown (e.g. LIMIT)
+ // Only shrink memory if it was a memory batch
Review Comment:
> // Only shrink memory if it was a memory batch
This comment seems stale as it isn't checking memory 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]