JingsongLi commented on code in PR #6944:
URL: https://github.com/apache/paimon/pull/6944#discussion_r2659282556


##########
paimon-python/pypaimon/read/scanner/full_starting_scanner.py:
##########
@@ -125,33 +127,16 @@ def with_shard(self, idx_of_this_subtask, 
number_of_para_subtasks) -> 'FullStart
         self.number_of_para_subtasks = number_of_para_subtasks
         return self
 
-    def _append_only_filter_by_shard(self, partitioned_files: defaultdict) -> 
(defaultdict, int, int):
-        """
-        Filter file entries by shard. Only keep the files within the range, 
which means
-        that only the starting and ending files need to be further divided 
subsequently
-        """
-        total_row = 0
-        # Sort by file creation time to ensure consistent sharding
-        for key, file_entries in partitioned_files.items():
-            for entry in file_entries:
-                total_row += entry.file.row_count
-
-        # Calculate number of rows this shard should process using balanced 
distribution
-        # Distribute remainder evenly among first few shards to avoid last 
shard overload
-        base_rows_per_shard = total_row // self.number_of_para_subtasks
-        remainder = total_row % self.number_of_para_subtasks
-
-        # Each of the first 'remainder' shards gets one extra row
-        if self.idx_of_this_subtask < remainder:
-            num_row = base_rows_per_shard + 1
-            start_row = self.idx_of_this_subtask * (base_rows_per_shard + 1)
-        else:
-            num_row = base_rows_per_shard
-            start_row = (remainder * (base_rows_per_shard + 1) +
-                         (self.idx_of_this_subtask - remainder) * 
base_rows_per_shard)
-
-        end_row = start_row + num_row
+    def with_row_range(self, start_row, end_row) -> 'FullStartingScanner':

Review Comment:
   Can we raise an exception for primary key tables?



##########
paimon-python/pypaimon/read/table_scan.py:
##########
@@ -71,3 +71,7 @@ def _create_starting_scanner(self) -> 
Optional[StartingScanner]:
     def with_shard(self, idx_of_this_subtask, number_of_para_subtasks) -> 
'TableScan':
         self.starting_scanner.with_shard(idx_of_this_subtask, 
number_of_para_subtasks)
         return self
+
+    def with_row_range(self, start_row, end_row) -> 'TableScan':

Review Comment:
   Are we returning this exact number of rows, or can it be approximate? This 
needs to be specified clearly in the comments.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to