This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new 68fa7310f GH-3320: Ensure parquet reader does not fail due to incorrect statistics (#3325)
68fa7310f is described below

commit 68fa7310f0333de2729928a0df200ac423c7673d
Author: Arnav Balyan <[email protected]>
AuthorDate: Mon Sep 29 21:10:11 2025 +0530

    GH-3320: Ensure parquet reader does not fail due to incorrect statistics (#3325)
---
 .../filter2/columnindex/ColumnIndexFilter.java     | 35 +++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java
index 8b6ee1f95..fd26e54d7 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java
@@ -192,7 +192,12 @@ public class ColumnIndexFilter implements Visitor<RowRanges> {
       return allRows();
     }
 
-    return RowRanges.create(rowCount, func.apply(ci), oi);
+    if (!isValidIndexSize(ci, oi, columnPath)) {
+      return allRows();
+    }
+
+    PrimitiveIterator.OfInt pageIndexes = func.apply(ci);
+    return RowRanges.create(rowCount, pageIndexes, oi);
   }
 
   @Override
@@ -220,4 +225,32 @@ public class ColumnIndexFilter implements Visitor<RowRanges> {
     throw new IllegalArgumentException(
         "Predicates containing a NOT must be run through LogicalInverseRewriter. " + not);
   }
+
+  /**
+   * Validates that column index and offset index metadata are consistent and can be used safely.
+   *
+   * @param columnIndex the column index to validate
+   * @param offsetIndex the offset index to validate
+   * @param columnPath the column path for error reporting
+   * @return true if metadata is valid and safe to use, false if corrupt and should be ignored
+   */
+  private static boolean isValidIndexSize(ColumnIndex columnIndex, OffsetIndex offsetIndex, ColumnPath columnPath) {
+
+    int columnIndexSize = columnIndex.getMinValues().size();
+    int offsetIndexSize = offsetIndex.getPageCount();
+
+    if (columnIndexSize != offsetIndexSize) {
+      LOGGER.warn(
+          "Column index and offset index size mismatch for column {}: "
+              + "column index has {} entries but offset index has {} pages. "
+              + "This indicates corrupted metadata from the writer. "
+              + "Ignoring column index for filtering to avoid errors.",
+          columnPath,
+          columnIndexSize,
+          offsetIndexSize);
+      return false;
+    }
+
+    return true;
+  }
 }

Reply via email to