XiaoHongbo-Hope commented on code in PR #6944:
URL: https://github.com/apache/paimon/pull/6944#discussion_r2659291867
##########
paimon-python/pypaimon/read/scanner/full_starting_scanner.py:
##########
@@ -125,33 +127,16 @@ def with_shard(self, idx_of_this_subtask,
number_of_para_subtasks) -> 'FullStart
self.number_of_para_subtasks = number_of_para_subtasks
return self
- def _append_only_filter_by_shard(self, partitioned_files: defaultdict) -> (defaultdict, int, int):
- """
- Filter file entries by shard. Only keep the files within the range, which means
- that only the starting and ending files need to be further divided subsequently
- """
- total_row = 0
- # Sort by file creation time to ensure consistent sharding
- for key, file_entries in partitioned_files.items():
- for entry in file_entries:
- total_row += entry.file.row_count
-
- # Calculate number of rows this shard should process using balanced distribution
- # Distribute remainder evenly among first few shards to avoid last shard overload
- base_rows_per_shard = total_row // self.number_of_para_subtasks
- remainder = total_row % self.number_of_para_subtasks
-
- # Each of the first 'remainder' shards gets one extra row
- if self.idx_of_this_subtask < remainder:
- num_row = base_rows_per_shard + 1
- start_row = self.idx_of_this_subtask * (base_rows_per_shard + 1)
- else:
- num_row = base_rows_per_shard
- start_row = (remainder * (base_rows_per_shard + 1) +
- (self.idx_of_this_subtask - remainder) * base_rows_per_shard)
-
- end_row = start_row + num_row
+ def with_row_range(self, start_row, end_row) -> 'FullStartingScanner':
Review Comment:
Can `with_row_range` and `with_shard` be used at the same time, or are they mutually exclusive?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]