This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new cdfd175f perf(parquet/metadata): avoid constructing ColumnChunkMetaData in page index range determination (#831)
cdfd175f is described below

commit cdfd175fd4d7904c14357023164d600f041ed516
Author: Ondřej Pavela <[email protected]>
AuthorDate: Mon Jun 1 19:46:31 2026 +0200

    perf(parquet/metadata): avoid constructing ColumnChunkMetaData in page index range determination (#831)
    
    ### Rationale for this change
    
    `determinePageIndexRangesInRowGroup` called
    `RowGroupMetaData.ColumnChunk()` for every column in a row group,
    constructing a full `ColumnChunkMetaData` (allocating the struct,
    copying encoding slices, etc.) just to read two offset/length
    pairs. In a compaction workload processing hundreds of small Parquet
    files, this path alone accounted for 18.3 GB of allocation churn (36% of
    total allocations). The objects were immediately discarded after reading
    two int fields.
    
    ### What changes are included in this PR?
    
    Added lightweight `ColumnIndexLocation()` and `OffsetIndexLocation()`
    methods on `RowGroupMetaData` that read index locations directly from
    the underlying thrift struct with zero heap allocations. Updated
    `determinePageIndexRangesInRowGroup`
    to use these instead of constructing full `ColumnChunkMetaData` objects.
    
    ### Are these changes tested?
    
    Yes — existing `parquet/metadata` and `parquet/file` tests pass.
    Verified zero heap allocations on the hot path via escape analysis
    (`go build -gcflags='-m -m'`). Confirmed with heap profiles that
    `NewColumnChunkMetaData` allocations through this path dropped from
    18.3 GB to 106 MB (99.4% reduction).
    
    ### Are there any user-facing changes?
    
    Two new public methods on `RowGroupMetaData`:
    `ColumnIndexLocation(i int) (IndexLocation, bool)` and
    `OffsetIndexLocation(i int) (IndexLocation, bool)`. No breaking changes.
---
 parquet/metadata/page_index.go                | 17 +++---
 parquet/metadata/page_index_benchmark_test.go | 84 +++++++++++++++++++++++++++
 parquet/metadata/row_group.go                 | 28 +++++++++
 3 files changed, 120 insertions(+), 9 deletions(-)

diff --git a/parquet/metadata/page_index.go b/parquet/metadata/page_index.go
index 7e0b3e44..63a207e7 100644
--- a/parquet/metadata/page_index.go
+++ b/parquet/metadata/page_index.go
@@ -443,7 +443,6 @@ func determinePageIndexRangesInRowGroup(rgMeta *RowGroupMetaData, cols []int32)
                return nil
        }
 
-       var colChunk *ColumnChunkMetaData
        if len(cols) == 0 {
                cols = make([]int32, rgMeta.NumColumns())
                for i := 0; i < rgMeta.NumColumns(); i++ {
@@ -456,16 +455,16 @@ func determinePageIndexRangesInRowGroup(rgMeta *RowGroupMetaData, cols []int32)
                        return rng, fmt.Errorf("%w: invalid column ordinal %d", 
arrow.ErrIndex, i)
                }
 
-               if colChunk, _ = rgMeta.ColumnChunk(int(i)); colChunk == nil {
-                       continue
-               }
-
-               if err = mergeRange(colChunk.GetColumnIndexLocation(), 
&ciStart, &ciEnd); err != nil {
-                       return
+               if colIdx, ok := rgMeta.ColumnIndexLocation(int(i)); ok {
+                       if err = mergeRange(&colIdx, &ciStart, &ciEnd); err != 
nil {
+                               return
+                       }
                }
 
-               if err = mergeRange(colChunk.GetOffsetIndexLocation(), 
&oiStart, &oiEnd); err != nil {
-                       return
+               if offsetIdx, ok := rgMeta.OffsetIndexLocation(int(i)); ok {
+                       if err = mergeRange(&offsetIdx, &oiStart, &oiEnd); err 
!= nil {
+                               return
+                       }
                }
        }
 
diff --git a/parquet/metadata/page_index_benchmark_test.go b/parquet/metadata/page_index_benchmark_test.go
new file mode 100644
index 00000000..ce1c5491
--- /dev/null
+++ b/parquet/metadata/page_index_benchmark_test.go
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metadata
+
+import (
+       "fmt"
+       "testing"
+
+       format "github.com/apache/arrow-go/v18/parquet/internal/gen-go/parquet"
+)
+
+func makeRowGroupMetaData(numCols int) *RowGroupMetaData {
+       columns := make([]*format.ColumnChunk, numCols)
+       for i := range numCols {
+               ciOffset := int64(1000 * (i + 1))
+               ciLength := int32(500)
+               oiOffset := int64(2000 * (i + 1))
+               oiLength := int32(300)
+               columns[i] = &format.ColumnChunk{
+                       ColumnIndexOffset: &ciOffset,
+                       ColumnIndexLength: &ciLength,
+                       OffsetIndexOffset: &oiOffset,
+                       OffsetIndexLength: &oiLength,
+               }
+       }
+
+       return NewRowGroupMetaData(&format.RowGroup{Columns: columns}, nil, 
nil, nil)
+}
+
+func TestDeterminePageIndexRangesInRowGroupAllocs(t *testing.T) {
+       rgMeta10 := makeRowGroupMetaData(10)
+       rgMeta100 := makeRowGroupMetaData(100)
+       cols10 := make([]int32, 10)
+       for i := range cols10 {
+               cols10[i] = int32(i)
+       }
+       cols100 := make([]int32, 100)
+       for i := range cols100 {
+               cols100[i] = int32(i)
+       }
+
+       allocs10 := testing.AllocsPerRun(100, func() {
+               if _, err := determinePageIndexRangesInRowGroup(rgMeta10, 
cols10); err != nil {
+                       t.Fatal(err)
+               }
+       })
+       allocs100 := testing.AllocsPerRun(100, func() {
+               if _, err := determinePageIndexRangesInRowGroup(rgMeta100, 
cols100); err != nil {
+                       t.Fatal(err)
+               }
+       })
+
+       if allocs10 != allocs100 {
+               t.Errorf("allocations should not scale with column count: 10 
cols = %v, 100 cols = %v", allocs10, allocs100)
+       }
+}
+
+func BenchmarkDeterminePageIndexRangesInRowGroup(b *testing.B) {
+       for _, numCols := range []int{10, 50, 100} {
+               rgMeta := makeRowGroupMetaData(numCols)
+               b.Run(fmt.Sprintf("cols=%d", numCols), func(b *testing.B) {
+                       b.ReportAllocs()
+                       for range b.N {
+                               if _, err := 
determinePageIndexRangesInRowGroup(rgMeta, nil); err != nil {
+                                       b.Fatal(err)
+                               }
+                       }
+               })
+       }
+}
diff --git a/parquet/metadata/row_group.go b/parquet/metadata/row_group.go
index 5ccd2e62..c8d19a01 100644
--- a/parquet/metadata/row_group.go
+++ b/parquet/metadata/row_group.go
@@ -91,6 +91,34 @@ func (r *RowGroupMetaData) SortingColumns() []parquet.SortingColumn {
        return r.sortCols
 }
 
+// ColumnIndexLocation returns the column index location for a column chunk
+// directly from the underlying thrift struct, avoiding the overhead of
+// constructing a full ColumnChunkMetaData.
+func (r *RowGroupMetaData) ColumnIndexLocation(i int) (IndexLocation, bool) {
+       col := r.rowGroup.Columns[i]
+       if col.IsSetColumnIndexOffset() {
+               return IndexLocation{
+                       Offset: col.GetColumnIndexOffset(),
+                       Length: col.GetColumnIndexLength(),
+               }, true
+       }
+       return IndexLocation{}, false
+}
+
+// OffsetIndexLocation returns the offset index location for a column chunk
+// directly from the underlying thrift struct, avoiding the overhead of
+// constructing a full ColumnChunkMetaData.
+func (r *RowGroupMetaData) OffsetIndexLocation(i int) (IndexLocation, bool) {
+       col := r.rowGroup.Columns[i]
+       if col.IsSetOffsetIndexOffset() {
+               return IndexLocation{
+                       Offset: col.GetOffsetIndexOffset(),
+                       Length: col.GetOffsetIndexLength(),
+               }, true
+       }
+       return IndexLocation{}, false
+}
+
 // RowGroupMetaDataBuilder is a convenience object for constructing row group
 // metadata information. Primarily used in conjunction with writing new files.
 type RowGroupMetaDataBuilder struct {

Reply via email to