XiaoHongbo-Hope commented on code in PR #7014:
URL: https://github.com/apache/paimon/pull/7014#discussion_r2684947107
##########
paimon-python/pypaimon/read/reader/shard_batch_reader.py:
##########
@@ -59,3 +60,78 @@ def read_arrow_batch(self) -> Optional[RecordBatch]:
def close(self):
self.reader.close()
+
+
+class SampleBatchReader(RecordBatchReader):
+ """
+ A reader that reads a subset of rows from a data file based on specified
sample positions.
+
+ This reader wraps another RecordBatchReader and only returns rows at the
specified
+ sample positions, enabling efficient random sampling of data without
reading all rows.
+
+ The reader supports two modes:
+ 1. For blob readers: Directly reads specific rows by index
+ 2. For other readers: Reads batches sequentially and extracts only the
sampled rows
+
+ Attributes:
+ reader: The underlying RecordBatchReader to read data from
+ sample_positions: A sorted list of row indices to sample (0-based)
+ sample_idx: Current index in the sample_positions list
+ current_pos: Current absolute row position in the data file
+ """
+
+ def __init__(self, reader, sample_positions):
+ """
+ Initialize the SampleBatchReader.
+
+ Args:
+ reader: The underlying RecordBatchReader to read data from
+ sample_positions: A sorted list of row indices to sample (0-based).
+ Must be sorted in ascending order for correct
behavior.
+ """
+ self.reader = reader
+ self.sample_positions = sample_positions
+ self.sample_idx = 0
+ self.current_pos = 0
+
+ def read_arrow_batch(self) -> Optional[RecordBatch]:
+ """
+ Read the next batch containing sampled rows.
+
+ This method reads data from the underlying reader and returns only the
rows
+ at the specified sample positions. The behavior differs based on
reader type:
+
+ - For FormatBlobReader: Directly reads individual rows by index
+ - For other readers: Reads batches sequentially and extracts sampled
rows
+ using PyArrow's take() method
+ """
+ if self.sample_idx >= len(self.sample_positions):
+ return None
+ if isinstance(self.reader.format_reader, FormatBlobReader):
+ # For blob reader, pass begin_idx and end_idx parameters
+ self.sample_idx += 1
+ return self.reader.read_arrow_batch(start_idx=self.sample_idx - 1,
end_idx=self.sample_idx)
Review Comment:
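Should the index passed here be `sample_idx`, or `self.sample_positions[self.sample_idx]`? As written, `start_idx`/`end_idx` are derived from the counter `sample_idx` (the position within the `sample_positions` list), not from the sampled row position stored in that list.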
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]