PARQUET-569: Separate metadata filtering for ranges and offsets. Range filtering should use the row group midpoint, and offset filtering should use the row group start offset.
Author: Ryan Blue <[email protected]> Closes #337 from rdblue/PARQUET-569-fix-metadata-filter and squashes the following commits: 6171af4 [Ryan Blue] PARQUET-569: Add tests for new offset metadata filter. 3fe2d5e [Ryan Blue] PARQUET-569: Separate metadata filtering for ranges and offsets. Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/fd227473 Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/fd227473 Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/fd227473 Branch: refs/heads/parquet-1.8.x Commit: fd227473c119b9caf89ad2ac11f0c63ab70ce403 Parents: d901cf9 Author: Ryan Blue <[email protected]> Authored: Fri Apr 22 17:42:35 2016 -0700 Committer: Ryan Blue <[email protected]> Committed: Mon Jan 9 16:54:53 2017 -0800 ---------------------------------------------------------------------- .../converter/ParquetMetadataConverter.java | 36 ++++++++++++++------ .../converter/TestParquetMetadataConverter.java | 31 +++++++++++++++-- 2 files changed, 54 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd227473/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 6feb4a2..75b07fd 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -478,6 +478,7 @@ public class ParquetMetadataConverter { private static interface MetadataFilterVisitor<T, E extends Throwable> { T visit(NoFilter filter) throws E; T 
visit(SkipMetadataFilter filter) throws E; + T visit(RangeMetadataFilter filter) throws E; T visit(OffsetMetadataFilter filter) throws E; } @@ -501,7 +502,7 @@ public class ParquetMetadataConverter { for (long offset : offsets) { set.add(offset); } - return new OffsetListMetadataFilter(set); + return new OffsetMetadataFilter(set); } private static final class NoFilter extends MetadataFilter { @@ -527,16 +528,12 @@ public class ParquetMetadataConverter { } } - interface OffsetMetadataFilter { - boolean contains(long offset); - } - /** * [ startOffset, endOffset ) * @author Julien Le Dem */ // Visible for testing - static final class RangeMetadataFilter extends MetadataFilter implements OffsetMetadataFilter { + static final class RangeMetadataFilter extends MetadataFilter { final long startOffset; final long endOffset; @@ -551,7 +548,6 @@ public class ParquetMetadataConverter { return visitor.visit(this); } - @Override public boolean contains(long offset) { return offset >= this.startOffset && offset < this.endOffset; } @@ -562,10 +558,10 @@ public class ParquetMetadataConverter { } } - static final class OffsetListMetadataFilter extends MetadataFilter implements OffsetMetadataFilter { + static final class OffsetMetadataFilter extends MetadataFilter { private final Set<Long> offsets; - public OffsetListMetadataFilter(Set<Long> offsets) { + public OffsetMetadataFilter(Set<Long> offsets) { this.offsets = offsets; } @@ -585,7 +581,7 @@ public class ParquetMetadataConverter { } // Visible for testing - static FileMetaData filterFileMetaData(FileMetaData metaData, OffsetMetadataFilter filter) { + static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) { List<RowGroup> rowGroups = metaData.getRow_groups(); List<RowGroup> newRowGroups = new ArrayList<RowGroup>(); for (RowGroup rowGroup : rowGroups) { @@ -604,6 +600,19 @@ public class ParquetMetadataConverter { } // Visible for testing + static FileMetaData 
filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) { + List<RowGroup> rowGroups = metaData.getRow_groups(); + List<RowGroup> newRowGroups = new ArrayList<RowGroup>(); + for (RowGroup rowGroup : rowGroups) { + long startIndex = getOffset(rowGroup.getColumns().get(0)); + if (filter.contains(startIndex)) { + newRowGroups.add(rowGroup); + } + } + metaData.setRow_groups(newRowGroups); + return metaData; + } + static long getOffset(RowGroup rowGroup) { return getOffset(rowGroup.getColumns().get(0)); } @@ -631,7 +640,12 @@ public class ParquetMetadataConverter { @Override public FileMetaData visit(OffsetMetadataFilter filter) throws IOException { - return filterFileMetaData(readFileMetaData(from), filter); + return filterFileMetaDataByStart(readFileMetaData(from), filter); + } + + @Override + public FileMetaData visit(RangeMetadataFilter filter) throws IOException { + return filterFileMetaDataByMidpoint(readFileMetaData(from), filter); } }); if (Log.DEBUG) LOG.debug(fileMetaData); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/fd227473/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java ---------------------------------------------------------------------- diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index eb109c0..b9cfde7 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -19,6 +19,7 @@ package org.apache.parquet.format.converter; import static java.util.Collections.emptyList; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart; import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; import 
static org.junit.Assert.assertEquals; import static org.junit.Assert.assertSame; @@ -27,7 +28,7 @@ import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED; import static org.apache.parquet.format.Type.INT32; import static org.apache.parquet.format.Util.readPageHeader; import static org.apache.parquet.format.Util.writePageHeader; -import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaData; +import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByMidpoint; import static org.apache.parquet.format.converter.ParquetMetadataConverter.getOffset; import java.io.ByteArrayInputStream; @@ -43,6 +44,7 @@ import java.util.Random; import java.util.Set; import java.util.TreeSet; +import com.google.common.collect.Sets; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; @@ -170,7 +172,20 @@ public class TestParquetMetadataConverter { } private FileMetaData filter(FileMetaData md, long start, long end) { - return filterFileMetaData(new FileMetaData(md), new ParquetMetadataConverter.RangeMetadataFilter(start, end)); + return filterFileMetaDataByMidpoint(new FileMetaData(md), + new ParquetMetadataConverter.RangeMetadataFilter(start, end)); + } + + private FileMetaData find(FileMetaData md, Long... blockStart) { + return filterFileMetaDataByStart(new FileMetaData(md), + new ParquetMetadataConverter.OffsetMetadataFilter( + Sets.newHashSet((Long[]) blockStart))); + } + + private FileMetaData find(FileMetaData md, long blockStart) { + return filterFileMetaDataByStart(new FileMetaData(md), + new ParquetMetadataConverter.OffsetMetadataFilter( + Sets.newHashSet(blockStart))); } private void verifyMD(FileMetaData md, long... 
offsets) { @@ -243,6 +258,18 @@ public class TestParquetMetadataConverter { } @Test + public void testFindRowGroups() { + verifyMD(find(metadata(50, 50, 50), 0), 0); + verifyMD(find(metadata(50, 50, 50), 50), 50); + verifyMD(find(metadata(50, 50, 50), 100), 100); + verifyMD(find(metadata(50, 50, 50), 0L, 50L), 0, 50); + verifyMD(find(metadata(50, 50, 50), 0L, 50L, 100L), 0, 50, 100); + verifyMD(find(metadata(50, 50, 50), 50L, 100L), 50, 100); + // doesn't find an offset that isn't the start of a row group. + verifyMD(find(metadata(50, 50, 50), 10)); + } + + @Test public void randomTestFilterMetaData() { // randomized property based testing // if it fails add the case above
