[
https://issues.apache.org/jira/browse/ARROW-18309?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17634106#comment-17634106
]
jun wang edited comment on ARROW-18309 at 11/15/22 2:49 AM:
------------------------------------------------------------
I tested Arrow v10: it panics now, and ValuesLeft() is broken. Here is the
code I used:
{code:java}
// TestDeltaBitPacking round-trips the int64 values stored one per line in
// "timestamp.data" through the DeltaBinaryPacked encoder and decoder.
//
// The values are decoded in batches of batchSize so that the streaming path
// of Decode is exercised; after every batch the decoded values and the count
// reported by ValuesLeft are checked against the values not yet consumed.
func TestDeltaBitPacking(t *testing.T) {
	f, err := os.Open("timestamp.data")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()

	// Each line of the input file holds one base-10 int64.
	values := make([]int64, 0)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		v, err := strconv.ParseInt(scanner.Text(), 10, 64)
		if err != nil {
			t.Fatal(err)
		}
		values = append(values, v)
	}
	if err := scanner.Err(); err != nil {
		t.Fatal(err)
	}

	col := schema.NewColumn(schema.MustPrimitive(schema.NewPrimitiveNode("foo",
		parquet.Repetitions.Required,
		parquet.Types.Int64, -1, -1)), 0, 0)

	enc := encoding.NewEncoder(parquet.Types.Int64,
		parquet.Encodings.DeltaBinaryPacked, false, col,
		memory.DefaultAllocator).(encoding.Int64Encoder)
	enc.Put(values)
	buf, err := enc.FlushValues()
	if err != nil {
		t.Fatal(err)
	}
	defer buf.Release()

	dec := encoding.NewDecoder(parquet.Types.Int64,
		parquet.Encodings.DeltaBinaryPacked, col,
		memory.DefaultAllocator).(encoding.Int64Decoder)
	dec.SetData(len(values), buf.Bytes())

	const batchSize = 1024
	// values is re-sliced inside the loop, so the loop bound must be the
	// total captured up front rather than len(values).
	total := len(values)
	for i := 0; i < total; i += batchSize {
		out := make([]int64, batchSize)
		n, err := dec.Decode(out)
		if err != nil {
			t.Fatal(err)
		}
		// Only the first n elements of out were produced by this call; the
		// final batch is shorter than batchSize unless total is a multiple
		// of it, so compare against out[:n] rather than the whole buffer.
		assert.Equal(t, values[:n], out[:n])
		values = values[n:]
		// assert.Equal takes (expected, actual): the decoder must report
		// exactly as many values left as have not been consumed yet.
		assert.Equal(t, len(values), dec.ValuesLeft())
	}
}
{code}
was (Author: JIRAUSER298350):
I tested Arrow v10: it does not panic now, but ValuesLeft() is broken.
Here is the code I used:
{code:java}
// TestDeltaBitPacking reads one base-10 int64 per line from "timestamp.data",
// encodes the values with the DeltaBinaryPacked encoding, and decodes them
// back in 1024-element batches, comparing each batch and dec.ValuesLeft()
// against the values that have not been consumed yet.
func TestDeltaBitPacking(t *testing.T) {
f, err := os.Open("timestamp.data")
if err != nil {
t.Fatal(err)
}
defer f.Close()
// Collect every line of the file as an int64.
values := make([]int64, 0)
scanner := bufio.NewScanner(f)
for scanner.Scan() {
v, err := strconv.ParseInt(scanner.Text(), 10, 64)
if err != nil {
t.Fatal(err)
}
values = append(values, v)
}
if err := scanner.Err(); err != nil {
t.Fatal(err)
}
// Describe a single required INT64 column named "foo".
col := schema.NewColumn(schema.MustPrimitive(schema.NewPrimitiveNode("foo",
parquet.Repetitions.Required,
parquet.Types.Int64, -1, -1)), 0, 0)
// Encode all values in one call and flush them into buf.
enc := encoding.NewEncoder(parquet.Types.Int64,
parquet.Encodings.DeltaBinaryPacked, false, col,
memory.DefaultAllocator).(encoding.Int64Encoder)
enc.Put(values)
buf, err := enc.FlushValues()
if err != nil {
t.Fatal(err)
}
defer buf.Release()
// Hand the encoded bytes to a matching decoder.
dec := encoding.NewDecoder(parquet.Types.Int64,
parquet.Encodings.DeltaBinaryPacked, col,
memory.DefaultAllocator).(encoding.Int64Decoder)
dec.SetData(len(values), buf.Bytes())
// NOTE(review): len(values) is re-evaluated on every iteration while values
// is re-sliced at the bottom of the loop body, so i catches up with the
// shrinking length and the loop stops before all values have been decoded.
for i := 0; i < len(values); i += 1024 {
out := make([]int64, 1024)
n, err := dec.Decode(out)
if err != nil {
t.Fatal(err)
}
// NOTE(review): out always has 1024 elements, so this comparison can only
// succeed when n == 1024.
assert.Equal(t, values[:n], out)
// Drop the values consumed by this batch.
values = values[n:]
assert.Equal(t, dec.ValuesLeft(), len(values))
}
}
{code}
> [Go] delta_bit_packing Decode may panic
> ---------------------------------------
>
> Key: ARROW-18309
> URL: https://issues.apache.org/jira/browse/ARROW-18309
> Project: Apache Arrow
> Issue Type: Bug
> Components: Go
> Affects Versions: 9.0.0
> Environment: all release version
> Reporter: jun wang
> Assignee: Matthew Topol
> Priority: Major
> Fix For: 9.0.1
>
> Attachments: @timestamp.data
>
>
> [https://github.com/apache/arrow/blob/master/go/parquet/internal/encoding/delta_bit_packing.go]
> The Decode methods of DeltaBitPackInt32 and DeltaBitPackInt64 do not subtract
> the number of decoded values from d.nvals at the end, which makes streaming
> decode panic.
> Also, when copying the decoded values to out, the end index should be
> shared_utils.MinInt(int(d.valsPerMini), start + len(out)).
> When encoding 68610 timestamp values and decoding them in batches of 1024
> values, we encounter the panic.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)