JingsongLi commented on code in PR #230:
URL: https://github.com/apache/paimon-rust/pull/230#discussion_r3061706626
##########
crates/paimon/src/arrow/format/parquet.rs:
##########
@@ -825,6 +950,84 @@ impl AsyncFileReader for ArrowFileReader {
}
}
+// ---------------------------------------------------------------------------
+// Range coalescing
+// ---------------------------------------------------------------------------
+
+/// Merge nearby byte ranges to reduce the number of requests.
+///
+/// Ranges whose gap is ≤ `coalesce` bytes are merged into a single range.
+/// The input does not need to be sorted.
+fn merge_byte_ranges(ranges: &[Range<u64>], coalesce: u64) -> Vec<Range<u64>> {
+ if ranges.is_empty() {
+ return vec![];
+ }
+
+ let mut sorted = ranges.to_vec();
+ sorted.sort_unstable_by_key(|r| r.start);
+
+ let mut merged = Vec::with_capacity(sorted.len());
+ let mut start_idx = 0;
+ let mut end_idx = 1;
+
+ while start_idx != sorted.len() {
+ let mut range_end = sorted[start_idx].end;
+
+ while end_idx != sorted.len()
+ && sorted[end_idx]
+ .start
+ .checked_sub(range_end)
+ .map(|delta| delta <= coalesce)
+ .unwrap_or(true)
+ {
+ range_end = range_end.max(sorted[end_idx].end);
+ end_idx += 1;
+ }
+
+ merged.push(sorted[start_idx].start..range_end);
+ start_idx = end_idx;
+ end_idx += 1;
+ }
+
+ merged
+}
+
+/// Split merged ranges into fixed-size batches to utilize concurrency.
+/// Each merged range is divided into chunks of `expected_size`,
+/// with the last chunk taking whatever remains.
+/// Ranges smaller than `2 * MIN_SPLIT_SIZE` are kept as-is to
+/// avoid excessive small IO requests.
+fn split_ranges_for_concurrency(merged: Vec<Range<u64>>, target_count: usize) -> Vec<Range<u64>> {
+ if merged.is_empty() || target_count <= 1 {
+ return merged;
+ }
+
+ let mut result = Vec::with_capacity(merged.len());
+
+ for range in &merged {
+ let length = range.end - range.start;
+ let expected_size = MIN_SPLIT_SIZE.max(length / target_count as u64 + 1);
Review Comment:
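Should this be `MIN_SPLIT_SIZE.max(length.div_ceil(target_count as u64))`? The current `length / target_count as u64 + 1` is one byte larger than needed whenever `length` is an exact multiple of `target_count`.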
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]