This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new a132372 Parquet: Avoid extra footer read when position column is not needed (#1716)
a132372 is described below
commit a132372d59d212f45c6cfe532c271fcb5a1878a7
Author: Chen, Junjie <[email protected]>
AuthorDate: Sun Nov 8 06:46:28 2020 +0800
Parquet: Avoid extra footer read when position column is not needed (#1716)
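Before this change, ReadConf unconditionally re-opened the Parquet footer to map each row group's starting byte offset to the position of its first row, even when the reader never projected the _pos metadata column. The patch below skips that second footer read unless the expected schema actually contains MetadataColumns.ROW_POSITION. As a minimal, self-contained sketch of the guard (the projection schema and class name here are hypothetical, not part of the patch):

    import org.apache.iceberg.MetadataColumns;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    public class RowPositionGuardSketch {
      public static void main(String[] args) {
        // Hypothetical projection that reads only a data column, not _pos.
        Schema projection = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()));

        // The check this commit adds: the offset map is only worth building
        // when the projection includes the _pos metadata column.
        boolean needsRowPositions =
            projection.findField(MetadataColumns.ROW_POSITION.fieldId()) != null;

        System.out.println("needs row positions: " + needsRowPositions); // false
      }
    }

_pos is Iceberg's reserved metadata column for a row's ordinal position in its data file; when it is not projected, the start positions are never consulted, so the array can safely be filled with placeholder zeros.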
---
.../src/main/java/org/apache/iceberg/parquet/ReadConf.java | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java
index cb816da..78deb21 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java
@@ -27,6 +27,7 @@ import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
+import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Expression;
@@ -87,10 +88,10 @@ class ReadConf<T> {
this.rowGroups = reader.getRowGroups();
this.shouldSkip = new boolean[rowGroups.size()];
+ this.startRowPositions = new long[rowGroups.size()];
// Fetch all row groups starting positions to compute the row offsets of the filtered row groups
- Map<Long, Long> offsetToStartPos = generateOffsetToStartPos();
- this.startRowPositions = new long[rowGroups.size()];
+ Map<Long, Long> offsetToStartPos = generateOffsetToStartPos(expectedSchema);
ParquetMetricsRowGroupFilter statsFilter = null;
ParquetDictionaryRowGroupFilter dictFilter = null;
@@ -102,7 +103,7 @@ class ReadConf<T> {
long computedTotalValues = 0L;
for (int i = 0; i < shouldSkip.length; i += 1) {
BlockMetaData rowGroup = rowGroups.get(i);
- startRowPositions[i] = offsetToStartPos.get(rowGroup.getStartingPos());
+ startRowPositions[i] = offsetToStartPos == null ? 0 : offsetToStartPos.get(rowGroup.getStartingPos());
boolean shouldRead = filter == null || (
statsFilter.shouldRead(typeWithIds, rowGroup) &&
dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
@@ -166,7 +167,11 @@ class ReadConf<T> {
return shouldSkip;
}
- private Map<Long, Long> generateOffsetToStartPos() {
+ private Map<Long, Long> generateOffsetToStartPos(Schema schema) {
+ if (schema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) {
+ return null;
+ }
+
try (ParquetFileReader fileReader = newReader(file, ParquetReadOptions.builder().build())) {
Map<Long, Long> offsetToStartPos = new HashMap<>();
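The remainder of generateOffsetToStartPos (truncated above) is the work the new null return avoids: a second open of the file that walks every row group in the footer. A minimal sketch of that scan, assuming Parquet's standard ParquetFileReader and BlockMetaData API (the helper name is hypothetical):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.parquet.hadoop.ParquetFileReader;
    import org.apache.parquet.hadoop.metadata.BlockMetaData;

    class OffsetScanSketch {
      // Maps each row group's starting byte offset to the position of its
      // first row, by accumulating row counts across the file's row groups.
      static Map<Long, Long> offsetsToStartPositions(ParquetFileReader fileReader) {
        Map<Long, Long> offsetToStartPos = new HashMap<>();
        long curRowCount = 0;
        for (BlockMetaData block : fileReader.getRowGroups()) {
          offsetToStartPos.put(block.getStartingPos(), curRowCount);
          curRowCount += block.getRowCount();
        }
        return offsetToStartPos;
      }
    }

Returning null rather than an empty map lets the row-group loop distinguish "positions not needed" from "no row groups", which is why startRowPositions[i] falls back to 0 when offsetToStartPos is null.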