comphead commented on code in PR #3076:
URL: https://github.com/apache/datafusion-comet/pull/3076#discussion_r2695970900
##########
native/core/src/execution/shuffle/shuffle_writer.rs:
##########
@@ -598,6 +601,67 @@ impl MultiPartitionShuffleRepartitioner {
.await?;
self.scratch = scratch;
}
+ CometPartitioning::RoundRobin(num_output_partitions,
max_hash_columns) => {
+ // Comet implements "round robin" as hash partitioning on
columns.
+ // This achieves the same goal as Spark's round robin (even
distribution
+ // without semantic grouping) while being deterministic for
fault tolerance.
+ //
+ // Note: This produces different partition assignments than
Spark's round robin,
+ // which sorts by UnsafeRow binary representation before
assigning partitions.
+ // However, both approaches provide even distribution and
determinism.
+ let mut scratch = std::mem::take(&mut self.scratch);
+ let (partition_starts, partition_row_indices): (&Vec<u32>,
&Vec<u32>) = {
+ let mut timer = self.metrics.repart_time.timer();
+
+ let num_rows = input.num_rows();
+
+ // Collect columns for hashing, respecting
max_hash_columns limit
+ // max_hash_columns of 0 means no limit (hash all columns)
+ let num_columns_to_hash = if *max_hash_columns == 0 {
+ input.num_columns()
+ } else {
+ (*max_hash_columns).min(input.num_columns())
Review Comment:
We need to handle negative values properly.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]