This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 68fa7310f GH-3320: Ensure parquet reader does not fail due to incorrect statistics (#3325)
68fa7310f is described below
commit 68fa7310f0333de2729928a0df200ac423c7673d
Author: Arnav Balyan <[email protected]>
AuthorDate: Mon Sep 29 21:10:11 2025 +0530
GH-3320: Ensure parquet reader does not fail due to incorrect statistics (#3325)
---
.../filter2/columnindex/ColumnIndexFilter.java | 35 +++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java
index 8b6ee1f95..fd26e54d7 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java
@@ -192,7 +192,12 @@ public class ColumnIndexFilter implements Visitor<RowRanges> {
return allRows();
}
- return RowRanges.create(rowCount, func.apply(ci), oi);
+ if (!isValidIndexSize(ci, oi, columnPath)) {
+ return allRows();
+ }
+
+ PrimitiveIterator.OfInt pageIndexes = func.apply(ci);
+ return RowRanges.create(rowCount, pageIndexes, oi);
}
@Override
@@ -220,4 +225,32 @@ public class ColumnIndexFilter implements Visitor<RowRanges> {
throw new IllegalArgumentException(
"Predicates containing a NOT must be run through LogicalInverseRewriter. " + not);
}
+
+ /**
+ * Validates that column index and offset index metadata are consistent and can be used safely.
+ *
+ * @param columnIndex the column index to validate
+ * @param offsetIndex the offset index to validate
+ * @param columnPath the column path for error reporting
+ * @return true if metadata is valid and safe to use, false if corrupt and should be ignored
+ */
+ private static boolean isValidIndexSize(ColumnIndex columnIndex, OffsetIndex offsetIndex, ColumnPath columnPath) {
+
+ int columnIndexSize = columnIndex.getMinValues().size();
+ int offsetIndexSize = offsetIndex.getPageCount();
+
+ if (columnIndexSize != offsetIndexSize) {
+ LOGGER.warn(
+ "Column index and offset index size mismatch for column {}: "
+ + "column index has {} entries but offset index has {} pages. "
+ + "This indicates corrupted metadata from the writer. "
+ + "Ignoring column index for filtering to avoid errors.",
+ columnPath,
+ columnIndexSize,
+ offsetIndexSize);
+ return false;
+ }
+
+ return true;
+ }
}