rdblue commented on a change in pull request #204: Combine tasks to scan up to
target split size using parquet row group information
URL: https://github.com/apache/incubator-iceberg/pull/204#discussion_r290074023
##########
File path:
core/src/test/java/org/apache/iceberg/TestOffsetsBasedSplitScanTaskIterator.java
##########
@@ -25,26 +25,39 @@
import org.junit.Test;
public class TestOffsetsBasedSplitScanTaskIterator {
+
@Test
public void testSplits() {
// case when the last row group has more than one byte
- verify(asList(4L, 10L, 15L, 18L, 30L, 45L), 48L, asList(
- asList(4L, 6L), asList(10L, 5L), asList(15L, 3L), asList(18L, 12L),
asList(30L, 15L),
- asList(45L, 3L)));
+ verify(asList(4L, 10L, 15L, 18L, 30L, 45L), 48L, 20, asList(
+ asList(4L, 14L), asList(18L, 12L), asList(30L, 18L)));
// case when the last row group has 1 byte
- verify(asList(4L, 10L, 15L, 18L, 30L, 45L), 46L, asList(
- asList(4L, 6L), asList(10L, 5L), asList(15L, 3L), asList(18L, 12L),
asList(30L, 15L),
- asList(45L, 1L)));
+ verify(asList(4L, 10L, 15L, 18L, 30L, 45L), 46L, 20, asList(
+ asList(4L, 14L), asList(18L, 12L), asList(30L, 16L)));
+
+ // case when every row group is of target split size
+ verify(asList(4L, 24L, 44L, 64L, 84L, 104L), 124L, 20, asList(
+ asList(4L, 20L), asList(24L, 20L), asList(44L, 20L),
+ asList(64L, 20L), asList(84L, 20L), asList(104L, 20L)));
+
+ // case when every row group except last one is of target split size
+ verify(asList(4L, 24L, 44L, 64L, 84L, 104L), 108L, 20, asList(
+ asList(4L, 20L), asList(24L, 20L), asList(44L, 20L),
+ asList(64L, 20L), asList(84L, 20L), asList(104L, 4L)));
- // case when there is only one row group
- verify(asList(4L), 48L, asList(
- asList(4L, 44L)));
+ // case when target split size is smaller than splits determined by offset
boundaries
+ verify(asList(4L, 24L, 44L, 64L, 84L, 104L), 108L, 2, asList(
+ asList(4L, 20L), asList(24L, 20L), asList(44L, 20L),
+ asList(64L, 20L), asList(84L, 20L), asList(104L, 4L)));
}
- private static void verify(List<Long> offsetRanges, long fileLen,
List<List<Long>> offsetLenPairs) {
+ private static void verify(List<Long> offsetRanges, long fileLen,
+ long targetSplitSize, List<List<Long>>
offsetLenPairs) {
List<FileScanTask> tasks = Lists.newArrayList(
- new
BaseFileScanTask.OffsetsBasedSplitScanTaskIterator(offsetRanges, new
MockFileScanTask(fileLen)));
+ new
BaseFileScanTask.OffsetsAwareTargetSplitSizeScanTaskIterator(offsetRanges,
Review comment:
Nit: the style here is to wrap at the function call and place all of the
arguments on the next line.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]