This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
     new 4d2f1ee16 GH-3331: Track Column index page skip statistics during file read (#3330)
4d2f1ee16 is described below
commit 4d2f1ee16469562d1a9d155f9ad3de0d1d499f9f
Author: Arnav Balyan <[email protected]>
AuthorDate: Thu Sep 25 11:47:31 2025 +0530
GH-3331: Track Column index page skip statistics during file read (#3330)
---
.../parquet/hadoop/ColumnIndexFilterUtils.java | 21 ++++++++++++++++++++-
.../apache/parquet/hadoop/ParquetFileReader.java | 3 ++-
.../parquet/hadoop/ParquetFileReaderMetrics.java | 4 +++-
3 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java
index 4fb57ee40..e78381574 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java
@@ -18,6 +18,9 @@
*/
package org.apache.parquet.hadoop;
+import static org.apache.parquet.hadoop.ParquetFileReaderMetrics.PagesIncluded;
+import static org.apache.parquet.hadoop.ParquetFileReaderMetrics.PagesSkipped;
+
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import java.util.ArrayList;
@@ -129,14 +132,30 @@ class ColumnIndexFilterUtils {
   /*
    * Returns the filtered offset index containing only the pages which are overlapping with rowRanges.
    */
-  static OffsetIndex filterOffsetIndex(OffsetIndex offsetIndex, RowRanges rowRanges, long totalRowCount) {
+  static OffsetIndex filterOffsetIndex(
+      OffsetIndex offsetIndex,
+      RowRanges rowRanges,
+      long totalRowCount,
+      org.apache.parquet.ParquetReadOptions options) {
IntList indexMap = new IntArrayList();
+ int pagesIncluded = 0;
+ int pagesSkipped = 0;
for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) {
long from = offsetIndex.getFirstRowIndex(i);
       if (rowRanges.isOverlapping(from, offsetIndex.getLastRowIndex(i, totalRowCount))) {
indexMap.add(i);
+ pagesIncluded++;
+ } else {
+ pagesSkipped++;
}
}
+
+ if (options != null && options.getMetricsCallback() != null) {
+      final ParquetMetricsCallback metricsCallback = options.getMetricsCallback();
+ metricsCallback.setValueInt(PagesIncluded.name(), pagesIncluded);
+ metricsCallback.setValueInt(PagesSkipped.name(), pagesSkipped);
+ }
+
return new FilteredOffsetIndex(offsetIndex, indexMap.toIntArray());
}
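
Note that filterOffsetIndex runs once per column chunk (see the ParquetFileReader hunk below), so the callback receives a fresh PagesIncluded/PagesSkipped pair for every chunk; a consumer that wants file-level totals should accumulate rather than overwrite. A minimal sketch of such a callback follows. Only setValueInt appears in this diff, so the remaining interface methods and the package of ParquetMetricsCallback are assumptions here, not part of the commit:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;

// Import path assumed; adjust to wherever ParquetMetricsCallback lives in
// your parquet-java version.
import org.apache.parquet.hadoop.ParquetMetricsCallback;

public class AccumulatingMetricsCallback implements ParquetMetricsCallback {
  // One adder per metric name; filterOffsetIndex reports per column chunk,
  // so totals across chunks are accumulated here instead of overwritten.
  private final Map<String, LongAdder> totals = new ConcurrentHashMap<>();

  private void add(String name, long value) {
    totals.computeIfAbsent(name, k -> new LongAdder()).add(value);
  }

  @Override
  public void setValueInt(String name, int value) {
    add(name, value); // receives PagesIncluded / PagesSkipped from this change
  }

  // The methods below are assumed to exist on the interface; only
  // setValueInt is visible in this diff.
  @Override
  public void setValueLong(String name, long value) {
    add(name, value);
  }

  @Override
  public void setValueFloat(String name, float value) {
    // floating-point metrics ignored in this sketch
  }

  @Override
  public void setValueDouble(String name, double value) {
    // floating-point metrics ignored in this sketch
  }

  @Override
  public void setDuration(String name, long value) {
    add(name, value);
  }

  public long total(String name) {
    LongAdder adder = totals.get(name);
    return adder == null ? 0L : adder.sum();
  }
}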
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
index 2ef39f780..551b1bf6c 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -1423,7 +1423,8 @@ public class ParquetFileReader implements Closeable {
if (columnDescriptor != null) {
OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
-      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
+      OffsetIndex filteredOffsetIndex =
+          filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount(), options);
       for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
BenchmarkCounter.incrementTotalBytes(range.getLength());
long startingPos = range.getOffset();
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReaderMetrics.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReaderMetrics.java
index 737e6abb9..6c4b7fa3b 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReaderMetrics.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReaderMetrics.java
@@ -27,7 +27,9 @@ public enum ParquetFileReaderMetrics {
ReadThroughput("read throughput when reading Parquet file from storage
(MB/sec)"),
DecompressTime("time spent in block decompression"),
DecompressSize("decompressed data size (MB)"),
- DecompressThroughput("block decompression throughput (MB/sec)");
+ DecompressThroughput("block decompression throughput (MB/sec)"),
+ PagesIncluded("pages included due to column index filtering"),
+ PagesSkipped("pages skipped due to column index filtering");
private final String desc;
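
Reading the new counters end to end might look like the following sketch, which is not part of the commit: ParquetFileReader.open(InputFile, ParquetReadOptions), ParquetReadOptions.builder(), and readNextFilteredRowGroup() are existing API, while the withMetricsCallback builder method name is an assumption, since the callback registration path is not shown in this diff:

import java.io.IOException;

import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileReaderMetrics;
import org.apache.parquet.io.InputFile;

public class PageSkipStatsExample {
  public static void printPageStats(InputFile inputFile) throws IOException {
    AccumulatingMetricsCallback callback = new AccumulatingMetricsCallback();
    ParquetReadOptions options = ParquetReadOptions.builder()
        .withMetricsCallback(callback) // assumed builder method name
        .build();
    try (ParquetFileReader reader = ParquetFileReader.open(inputFile, options)) {
      // The new counters only fire on the filtered read path, and column
      // index filtering requires a record filter with column index
      // filtering enabled on the read options.
      while (reader.readNextFilteredRowGroup() != null) {
        // consume pages as usual
      }
    }
    System.out.println("pages included: "
        + callback.total(ParquetFileReaderMetrics.PagesIncluded.name()));
    System.out.println("pages skipped: "
        + callback.total(ParquetFileReaderMetrics.PagesSkipped.name()));
  }
}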