This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new f464c83  fix(parquet): TestDeltaByteArray implementation and fix (#369)
f464c83 is described below

commit f464c83425d707f2ba25857ec9852552165adc4e
Author: Victor Perez <[email protected]>
AuthorDate: Sat May 10 17:47:05 2025 +0200

    fix(parquet): TestDeltaByteArray implementation and fix (#369)
    
    This exact test seems to be passing on the v17 version. More
    investigation is needed.
    
    ### Rationale for this change
    
    After updating from v17 to v18 I noticed a test case failing that was
    passing before.
    
    
    ### What changes are included in this PR?
    
    This PR contains a test case to reproduce the issue. A reference PR for
    version v17 can be found here
    - https://github.com/apache/arrow/pull/46353
    
    
    ### Are these changes tested?
    
    The PR itself just contains the test
    
    ### Are there any user-facing changes?
    
    ---------
    
    Co-authored-by: Matt Topol <[email protected]>
---
 parquet/file/file_reader_test.go               | 52 ++++++++++++++++++++++++++
 parquet/internal/encoding/delta_bit_packing.go |  1 +
 parquet/internal/encoding/delta_byte_array.go  |  2 +-
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/parquet/file/file_reader_test.go b/parquet/file/file_reader_test.go
index a850fab..d15b4df 100644
--- a/parquet/file/file_reader_test.go
+++ b/parquet/file/file_reader_test.go
@@ -21,6 +21,7 @@ import (
        "context"
        "crypto/rand"
        "encoding/binary"
+       "encoding/csv"
        "fmt"
        "io"
        "os"
@@ -820,3 +821,54 @@ func TestLZ4RawLargerFileRead(t *testing.T) {
        }
        require.Equal(t, expectedValsHead, vals[:len(expectedValsHead)])
 }
+
+func TestDeltaByteArray(t *testing.T) {
+       dir := os.Getenv("PARQUET_TEST_DATA")
+       if dir == "" {
+               t.Skip("no path supplied with PARQUET_TEST_DATA")
+       }
+       require.DirExists(t, dir)
+
+       expected, err := os.ReadFile(path.Join(dir, 
"delta_byte_array_expect.csv"))
+       require.NoError(t, err)
+       csvReader := csv.NewReader(bytes.NewReader(expected))
+
+       records, err := csvReader.ReadAll()
+       require.NoError(t, err)
+
+       records = records[1:] // skip header
+
+       props := parquet.NewReaderProperties(memory.DefaultAllocator)
+       fileReader, err := file.OpenParquetFile(path.Join(dir, 
"delta_byte_array.parquet"),
+               false, file.WithReadProps(props))
+       require.NoError(t, err)
+       defer fileReader.Close()
+
+       nrows := fileReader.MetaData().NumRows
+       assert.Equal(t, nrows, int64(len(records)), "expected %d rows, got %d", 
len(records), nrows)
+
+       arrowReader, err := pqarrow.NewFileReader(
+               fileReader,
+               pqarrow.ArrowReadProperties{BatchSize: 1024},
+               memory.DefaultAllocator,
+       )
+       require.NoError(t, err)
+
+       rr, err := arrowReader.GetRecordReader(context.Background(), nil, nil)
+       require.NoError(t, err)
+       defer rr.Release()
+
+       for rr.Next() {
+               rec := rr.Record()
+               for i := range int(rec.NumCols()) {
+                       vals := rec.Column(i)
+                       for j := range vals.Len() {
+                               if vals.IsNull(j) {
+                                       require.Equal(t, records[j][i], "")
+                                       continue
+                               }
+                               require.Equal(t, records[j][i], 
vals.ValueStr(j))
+                       }
+               }
+       }
+}
diff --git a/parquet/internal/encoding/delta_bit_packing.go 
b/parquet/internal/encoding/delta_bit_packing.go
index a57b12c..9f43fc8 100644
--- a/parquet/internal/encoding/delta_bit_packing.go
+++ b/parquet/internal/encoding/delta_bit_packing.go
@@ -101,6 +101,7 @@ func (d *deltaBitPackDecoder[T]) SetData(nvalues int, data 
[]byte) error {
 
        d.valsPerMini = uint32(d.blockSize / d.miniBlocksPerBlock)
        d.usedFirst = false
+       d.nvals = int(d.totalValues)
        return nil
 }
 
diff --git a/parquet/internal/encoding/delta_byte_array.go 
b/parquet/internal/encoding/delta_byte_array.go
index bb2134a..580c83d 100644
--- a/parquet/internal/encoding/delta_byte_array.go
+++ b/parquet/internal/encoding/delta_byte_array.go
@@ -171,7 +171,7 @@ func (d *DeltaByteArrayDecoder) SetData(nvalues int, data 
[]byte) error {
                return err
        }
 
-       d.prefixLengths = make([]int32, nvalues)
+       d.prefixLengths = make([]int32, prefixLenDec.ValuesLeft())
        // decode all the prefix lengths first so we know how many bytes it 
took to get the
        // prefix lengths for nvalues
        prefixLenDec.Decode(d.prefixLengths)

Reply via email to