This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new efc538a6 fix(parquet/pqarrow): fix Decimal256 sign extension (#711)
efc538a6 is described below
commit efc538a67dada6e189c964a4fc879de9d8622ee2
Author: Dima Kuznetsov <[email protected]>
AuthorDate: Fri Mar 13 18:22:21 2026 +0200
fix(parquet/pqarrow): fix Decimal256 sign extension (#711)
bigEndianToDecimal256 in column_readers.go has a bug in the partial-word
sign extension path: it shifts by wordLen (byte count) instead of
wordLen*8 (bit count).
### Rationale for this change
Found while reading a Parquet file containing a Decimal256 column whose
byte width is not a multiple of 8, using arrow-go
### What changes are included in this PR?
fix for bigEndianToDecimal256
### Are these changes tested?
Yes, round-trip test included
### Are there any user-facing changes?
None
---
parquet/pqarrow/column_readers.go | 4 +-
parquet/pqarrow/encode_arrow_test.go | 91 ++++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 2 deletions(-)
diff --git a/parquet/pqarrow/column_readers.go b/parquet/pqarrow/column_readers.go
index 522e29c5..1add79f6 100644
--- a/parquet/pqarrow/column_readers.go
+++ b/parquet/pqarrow/column_readers.go
@@ -939,8 +939,8 @@ func bigEndianToDecimal256(buf []byte) (decimal256.Num, error) {
result := initWord
if len(buf) > 0 {
// incorporate the actual values if present
-		// shift left enough bits to make room for the incoming int64
-		result = result << uint64(wordLen)
+		// shift left enough bits to make room for the incoming bytes
+		result = result << uint64(wordLen*8)
 		// preserve the upper bits by inplace OR-ing the int64
 		result |= uint64FromBigEndianShifted(word)
}
diff --git a/parquet/pqarrow/encode_arrow_test.go b/parquet/pqarrow/encode_arrow_test.go
index 8b11e75a..b60a960c 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -19,8 +19,10 @@ package pqarrow_test
import (
"bytes"
"context"
+ "encoding/binary"
"fmt"
"math"
+ "math/big"
"strconv"
"strings"
"testing"
@@ -1229,6 +1231,95 @@ func (ps *ParquetIOTestSuite) TestReadDecimal256() {
 	ps.Truef(array.Equal(expected, chunked.Chunk(0)), "expected: %s\ngot: %s", expected, chunked.Chunk(0))
}
+// TestReadDecimal256PartialWord verifies that bigEndianToDecimal256 correctly
+// handles byte arrays whose length is not a multiple of 8, exercising the
+// partial-word sign-extension path. This is a regression test for a bug where
+// the shift was by wordLen (bytes) instead of wordLen*8 (bits).
+func (ps *ParquetIOTestSuite) TestReadDecimal256PartialWord() {
+	// decimal256ToBigEndian converts a big.Int to big-endian two's complement
+	// bytes truncated to byteWidth, simulating what the Parquet encoder writes.
+	decimal256ToBigEndian := func(bi *big.Int, byteWidth int) []byte {
+		num := decimal256.FromBigInt(bi)
+		vals := num.Array()
+		var full [32]byte
+		binary.BigEndian.PutUint64(full[0:], vals[3])
+		binary.BigEndian.PutUint64(full[8:], vals[2])
+		binary.BigEndian.PutUint64(full[16:], vals[1])
+		binary.BigEndian.PutUint64(full[24:], vals[0])
+		return append([]byte{}, full[32-byteWidth:]...)
+	}
+
+	// maxForPrecision returns 10^precision - 1 as a *big.Int.
+	maxForPrecision := func(precision int) *big.Int {
+		v := new(big.Int).Exp(big.NewInt(10), big.NewInt(int64(precision)), nil)
+		return v.Sub(v, big.NewInt(1))
+	}
+
+	// One precision per partial-word remainder (1-7), plus remainder 0 as sanity.
+	// precision : DecimalSize : byteWidth % 8
+	//  39 : 17 : 1   41 : 18 : 2   45 : 19 : 3   47 : 20 : 4
+	//  49 : 21 : 5   51 : 22 : 6   53 : 23 : 7   76 : 32 : 0
+	precisions := []int32{39, 41, 45, 47, 49, 51, 53, 76}
+
+	for _, precision := range precisions {
+		byteWidth := int(pqarrow.DecimalSize(precision))
+		maxVal := maxForPrecision(int(precision))
+		minVal := new(big.Int).Neg(maxVal)
+
+		tests := []struct {
+			name string
+			val  *big.Int
+		}{
+			{"max_positive", maxVal},
+			{"max_negative", minVal},
+			{"zero", big.NewInt(0)},
+			{"minus_one", big.NewInt(-1)},
+		}
+
+		for _, tt := range tests {
+			ps.Run(fmt.Sprintf("p%d_bw%d_r%d/%s", precision, byteWidth, byteWidth%8, tt.name), func() {
+				mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+				defer mem.AssertSize(ps.T(), 0)
+
+				bigEndian := []parquet.ByteArray{decimal256ToBigEndian(tt.val, byteWidth)}
+
+				bldr := array.NewDecimal256Builder(mem, &arrow.Decimal256Type{Precision: precision, Scale: 0})
+				defer bldr.Release()
+				bldr.Append(decimal256.FromBigInt(tt.val))
+				expected := bldr.NewDecimal256Array()
+				defer expected.Release()
+
+				sc := schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
+					schema.Must(schema.NewPrimitiveNodeLogical("decimals", parquet.Repetitions.Required,
+						schema.NewDecimalLogicalType(precision, 0), parquet.Types.ByteArray, -1, -1)),
+				}, -1))
+
+				sink := encoding.NewBufferWriter(0, mem)
+				defer sink.Release()
+				writer := file.NewParquetWriter(sink, sc)
+				rgw := writer.AppendRowGroup()
+				cw, _ := rgw.NextColumn()
+				cw.(*file.ByteArrayColumnChunkWriter).WriteBatch(bigEndian, nil, nil)
+				cw.Close()
+				rgw.Close()
+				writer.Close()
+
+				rdr := ps.createReader(mem, sink.Bytes())
+				cr, err := rdr.GetColumn(context.TODO(), 0)
+				ps.Require().NoError(err)
+
+				chunked, err := cr.NextBatch(smallSize)
+				ps.Require().NoError(err)
+				defer chunked.Release()
+
+				ps.Require().Len(chunked.Chunks(), 1)
+				ps.Truef(array.Equal(expected, chunked.Chunk(0)),
+					"expected: %s\ngot: %s", expected, chunked.Chunk(0))
+			})
+		}
+	}
+}
+
func (ps *ParquetIOTestSuite) TestReadNestedStruct() {
mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
defer mem.AssertSize(ps.T(), 0)