This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new cdfd175f perf(parquet/metadata): avoid constructing
ColumnChunkMetaData in page index range determination (#831)
cdfd175f is described below
commit cdfd175fd4d7904c14357023164d600f041ed516
Author: Ondřej Pavela <[email protected]>
AuthorDate: Mon Jun 1 19:46:31 2026 +0200
perf(parquet/metadata): avoid constructing ColumnChunkMetaData in page
index range determination (#831)
### Rationale for this change
`determinePageIndexRangesInRowGroup` called
`RowGroupMetaData.ColumnChunk()` for every column in a row group,
constructing a full `ColumnChunkMetaData` (allocating the struct,
copying encoding slices, etc.) just to read two offset/length
pairs. In a compaction workload processing hundreds of small Parquet
files, this path alone accounted for 18.3 GB of allocation churn (36% of
total allocations). The objects were discarded immediately after their
offset and length fields were read.
### What changes are included in this PR?
Added lightweight `ColumnIndexLocation()` and `OffsetIndexLocation()`
methods on `RowGroupMetaData` that read index locations directly from
the underlying thrift struct with zero heap allocations. Updated
`determinePageIndexRangesInRowGroup`
to use these instead of constructing full `ColumnChunkMetaData` objects.
### Are these changes tested?
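Yes — existing `parquet/metadata` and `parquet/file` tests pass, and this
commit adds `TestDeterminePageIndexRangesInRowGroupAllocs` and
`BenchmarkDeterminePageIndexRangesInRowGroup` in
`parquet/metadata/page_index_benchmark_test.go`.
Verified zero heap allocations on the hot path via escape analysis
(`go build -gcflags='-m -m'`). Confirmed with heap profiles that
`NewColumnChunkMetaData` allocations through this path dropped from
18.3 GB to 106 MB (99.4% reduction).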
### Are there any user-facing changes?
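Two new public methods on `RowGroupMetaData`:
`ColumnIndexLocation(i int) (IndexLocation, bool)` and
`OffsetIndexLocation(i int) (IndexLocation, bool)`.
Both index the row group's columns directly without a bounds check, so `i`
must be a valid column ordinal. No breaking changes.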
---
parquet/metadata/page_index.go | 17 +++---
parquet/metadata/page_index_benchmark_test.go | 84 +++++++++++++++++++++++++++
parquet/metadata/row_group.go | 28 +++++++++
3 files changed, 120 insertions(+), 9 deletions(-)
diff --git a/parquet/metadata/page_index.go b/parquet/metadata/page_index.go
index 7e0b3e44..63a207e7 100644
--- a/parquet/metadata/page_index.go
+++ b/parquet/metadata/page_index.go
@@ -443,7 +443,6 @@ func determinePageIndexRangesInRowGroup(rgMeta
*RowGroupMetaData, cols []int32)
return nil
}
- var colChunk *ColumnChunkMetaData
if len(cols) == 0 {
cols = make([]int32, rgMeta.NumColumns())
for i := 0; i < rgMeta.NumColumns(); i++ {
@@ -456,16 +455,16 @@ func determinePageIndexRangesInRowGroup(rgMeta
*RowGroupMetaData, cols []int32)
return rng, fmt.Errorf("%w: invalid column ordinal %d",
arrow.ErrIndex, i)
}
- if colChunk, _ = rgMeta.ColumnChunk(int(i)); colChunk == nil {
- continue
- }
-
- if err = mergeRange(colChunk.GetColumnIndexLocation(),
&ciStart, &ciEnd); err != nil {
- return
+ if colIdx, ok := rgMeta.ColumnIndexLocation(int(i)); ok {
+ if err = mergeRange(&colIdx, &ciStart, &ciEnd); err !=
nil {
+ return
+ }
}
- if err = mergeRange(colChunk.GetOffsetIndexLocation(),
&oiStart, &oiEnd); err != nil {
- return
+ if offsetIdx, ok := rgMeta.OffsetIndexLocation(int(i)); ok {
+ if err = mergeRange(&offsetIdx, &oiStart, &oiEnd); err
!= nil {
+ return
+ }
}
}
diff --git a/parquet/metadata/page_index_benchmark_test.go
b/parquet/metadata/page_index_benchmark_test.go
new file mode 100644
index 00000000..ce1c5491
--- /dev/null
+++ b/parquet/metadata/page_index_benchmark_test.go
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metadata
+
+import (
+ "fmt"
+ "testing"
+
+ format "github.com/apache/arrow-go/v18/parquet/internal/gen-go/parquet"
+)
+
+func makeRowGroupMetaData(numCols int) *RowGroupMetaData {
+ columns := make([]*format.ColumnChunk, numCols)
+ for i := range numCols {
+ ciOffset := int64(1000 * (i + 1))
+ ciLength := int32(500)
+ oiOffset := int64(2000 * (i + 1))
+ oiLength := int32(300)
+ columns[i] = &format.ColumnChunk{
+ ColumnIndexOffset: &ciOffset,
+ ColumnIndexLength: &ciLength,
+ OffsetIndexOffset: &oiOffset,
+ OffsetIndexLength: &oiLength,
+ }
+ }
+
+ return NewRowGroupMetaData(&format.RowGroup{Columns: columns}, nil,
nil, nil)
+}
+
+func TestDeterminePageIndexRangesInRowGroupAllocs(t *testing.T) {
+ rgMeta10 := makeRowGroupMetaData(10)
+ rgMeta100 := makeRowGroupMetaData(100)
+ cols10 := make([]int32, 10)
+ for i := range cols10 {
+ cols10[i] = int32(i)
+ }
+ cols100 := make([]int32, 100)
+ for i := range cols100 {
+ cols100[i] = int32(i)
+ }
+
+ allocs10 := testing.AllocsPerRun(100, func() {
+ if _, err := determinePageIndexRangesInRowGroup(rgMeta10,
cols10); err != nil {
+ t.Fatal(err)
+ }
+ })
+ allocs100 := testing.AllocsPerRun(100, func() {
+ if _, err := determinePageIndexRangesInRowGroup(rgMeta100,
cols100); err != nil {
+ t.Fatal(err)
+ }
+ })
+
+ if allocs10 != allocs100 {
+ t.Errorf("allocations should not scale with column count: 10
cols = %v, 100 cols = %v", allocs10, allocs100)
+ }
+}
+
+func BenchmarkDeterminePageIndexRangesInRowGroup(b *testing.B) {
+ for _, numCols := range []int{10, 50, 100} {
+ rgMeta := makeRowGroupMetaData(numCols)
+ b.Run(fmt.Sprintf("cols=%d", numCols), func(b *testing.B) {
+ b.ReportAllocs()
+ for range b.N {
+ if _, err :=
determinePageIndexRangesInRowGroup(rgMeta, nil); err != nil {
+ b.Fatal(err)
+ }
+ }
+ })
+ }
+}
diff --git a/parquet/metadata/row_group.go b/parquet/metadata/row_group.go
index 5ccd2e62..c8d19a01 100644
--- a/parquet/metadata/row_group.go
+++ b/parquet/metadata/row_group.go
@@ -91,6 +91,34 @@ func (r *RowGroupMetaData) SortingColumns()
[]parquet.SortingColumn {
return r.sortCols
}
+// ColumnIndexLocation returns the column index location for a column chunk
+// directly from the underlying thrift struct, avoiding the overhead of
+// constructing a full ColumnChunkMetaData.
+func (r *RowGroupMetaData) ColumnIndexLocation(i int) (IndexLocation, bool) {
+ col := r.rowGroup.Columns[i]
+ if col.IsSetColumnIndexOffset() {
+ return IndexLocation{
+ Offset: col.GetColumnIndexOffset(),
+ Length: col.GetColumnIndexLength(),
+ }, true
+ }
+ return IndexLocation{}, false
+}
+
+// OffsetIndexLocation returns the offset index location for a column chunk
+// directly from the underlying thrift struct, avoiding the overhead of
+// constructing a full ColumnChunkMetaData.
+func (r *RowGroupMetaData) OffsetIndexLocation(i int) (IndexLocation, bool) {
+ col := r.rowGroup.Columns[i]
+ if col.IsSetOffsetIndexOffset() {
+ return IndexLocation{
+ Offset: col.GetOffsetIndexOffset(),
+ Length: col.GetOffsetIndexLength(),
+ }, true
+ }
+ return IndexLocation{}, false
+}
+
// RowGroupMetaDataBuilder is a convenience object for constructing row group
// metadata information. Primarily used in conjunction with writing new files.
type RowGroupMetaDataBuilder struct {