[GitHub] [arrow] westonpace commented on a diff in pull request #14146: ARROW-17750: [Go] Kernel Input Type for RLE

GitBox Thu, 22 Sep 2022 11:56:14 -0700


westonpace commented on code in PR #14146:
URL: https://github.com/apache/arrow/pull/14146#discussion_r977973894



##########
go/arrow/array/concat.go:
##########
@@ -42,7 +43,12 @@ func Concatenate(arrs []arrow.Array, mem memory.Allocator) 
(result arrow.Array,
 
        defer func() {
                if pErr := recover(); pErr != nil {
-                       err = fmt.Errorf("arrow/concat: unknown error: %v", 
pErr)
+                       switch e := pErr.(type) {
+                       case error:

Review Comment:
   I don't really know go but why is it important to distinguish between these 
two error types?  Adding `unknown error` doesn't seem to contribute too much to 
a user's understanding of the error.



##########
go/arrow/array/encoded.go:
##########
@@ -0,0 +1,304 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package array
+
+import (
+       "bytes"
+       "fmt"
+       "math"
+       "sync/atomic"
+
+       "github.com/apache/arrow/go/v10/arrow"
+       "github.com/apache/arrow/go/v10/arrow/internal/debug"
+       "github.com/apache/arrow/go/v10/arrow/memory"
+       "github.com/apache/arrow/go/v10/arrow/rle"
+       "github.com/goccy/go-json"
+)
+
+// RunLengthEncoded represents an array containing two children:
+// an array of int32 values defining the ends of each run of values
+// and an array of values
+type RunLengthEncoded struct {
+       array
+
+       runEnds []int32
+       ends    arrow.Array
+       values  arrow.Array
+}
+
+func NewRunLengthEncodedArray(runEnds, values arrow.Array, logicalLength, 
offset int) *RunLengthEncoded {
+       data := NewData(arrow.RunLengthEncodedOf(values.DataType()), 
logicalLength,
+               []*memory.Buffer{nil}, []arrow.ArrayData{runEnds.Data(), 
values.Data()}, 0, offset)
+       defer data.Release()
+       return NewRunLengthEncodedData(data)
+}
+
+func NewRunLengthEncodedData(data arrow.ArrayData) *RunLengthEncoded {
+       r := &RunLengthEncoded{}
+       r.refCount = 1
+       r.setData(data.(*Data))
+       return r
+}
+
+func (r *RunLengthEncoded) RunEnds() []int32        { return r.runEnds }
+func (r *RunLengthEncoded) Values() arrow.Array     { return r.values }
+func (r *RunLengthEncoded) RunEndsArr() arrow.Array { return r.ends }
+
+func (r *RunLengthEncoded) Retain() {
+       r.array.Retain()
+       r.values.Retain()
+       r.ends.Retain()
+}
+
+func (r *RunLengthEncoded) Release() {
+       r.array.Release()
+       r.values.Release()
+       r.ends.Release()
+}
+
+func (r *RunLengthEncoded) setData(data *Data) {
+       if len(data.childData) != 2 {
+               panic(fmt.Errorf("%w: arrow/array: RLE array must have exactly 
2 children", arrow.ErrInvalid))
+       }
+       if data.childData[0].DataType().ID() != arrow.INT32 {
+               panic(fmt.Errorf("%w: arrow/array: run ends array must be 
int32", arrow.ErrInvalid))
+       }
+       if data.childData[0].NullN() > 0 {
+               panic(fmt.Errorf("%w: arrow/array: run ends array cannot 
contain nulls", arrow.ErrInvalid))
+       }
+
+       debug.Assert(data.dtype.ID() == arrow.RUN_LENGTH_ENCODED, "invalid type 
for RunLengthEncoded")
+       r.array.setData(data)
+
+       if r.data.childData[0].Buffers()[1] != nil {
+               r.runEnds = 
arrow.Int32Traits.CastFromBytes(r.data.childData[0].Buffers()[1].Bytes())
+       }
+       r.ends = MakeFromData(r.data.childData[0])
+       r.values = MakeFromData(r.data.childData[1])
+}
+
+func (r *RunLengthEncoded) GetPhysicalOffset() int {
+       return rle.FindPhysicalOffset(r.runEnds, r.data.offset)
+}
+
+func (r *RunLengthEncoded) GetPhysicalLength() int {
+       if r.data.length == 0 {
+               return 0
+       }
+
+       physicalOffset := r.GetPhysicalOffset()
+       return rle.FindPhysicalOffset(r.runEnds[physicalOffset:],
+               r.data.offset+r.data.length-1) + 1
+}
+
+func (r *RunLengthEncoded) String() string {
+       var buf bytes.Buffer
+       buf.WriteByte('[')
+       for i, runEnd := range r.runEnds {
+               if i != 0 {
+                       buf.WriteByte(',')
+               }
+
+               fmt.Fprintf(&buf, "{%d -> %v}", runEnd, 
r.values.(arraymarshal).getOneForMarshal(i))
+       }
+       buf.WriteByte(']')
+       return buf.String()
+}
+
+func (r *RunLengthEncoded) getOneForMarshal(i int) interface{} {
+       return [2]interface{}{r.runEnds[i], 
r.values.(arraymarshal).getOneForMarshal(i)}
+}
+
+func (r *RunLengthEncoded) MarshalJSON() ([]byte, error) {
+       var buf bytes.Buffer
+       enc := json.NewEncoder(&buf)
+       buf.WriteByte('[')
+
+       for i := range r.runEnds {
+               if i != 0 {
+                       buf.WriteByte(',')
+               }
+               if err := enc.Encode(r.getOneForMarshal(i)); err != nil {
+                       return nil, err
+               }
+       }
+       buf.WriteByte(']')
+       return buf.Bytes(), nil
+}
+
+func arrayRunLengthEncodedEqual(l, r *RunLengthEncoded) bool {
+       // types were already checked before getting here, so we know
+       // the encoded types are equal
+       mr := rle.NewMergedRuns([2]arrow.Array{l, r})
+       for mr.Next() {
+               lIndex := mr.IndexIntoArray(0)
+               rIndex := mr.IndexIntoArray(1)
+               if !SliceEqual(l.values, lIndex, lIndex+1, r.values, rIndex, 
rIndex+1) {
+                       return false
+               }
+       }
+       return true
+}
+
+func arrayRunLengthEncodedApproxEqual(l, r *RunLengthEncoded, opt equalOption) 
bool {
+       // types were already checked before getting here, so we know
+       // the encoded types are equal
+       mr := rle.NewMergedRuns([2]arrow.Array{l, r})
+       for mr.Next() {
+               lIndex := mr.IndexIntoArray(0)
+               rIndex := mr.IndexIntoArray(1)
+               if !sliceApproxEqual(l.values, lIndex, lIndex+1, r.values, 
rIndex, rIndex+1, opt) {
+                       return false
+               }
+       }
+       return true
+
+}
+
+type RunLengthEncodedBuilder struct {
+       builder
+
+       dt      arrow.DataType
+       runEnds *Int32Builder
+       values  Builder
+}
+
+func NewRunLengthEncodedBuilder(mem memory.Allocator, typ arrow.DataType) 
*RunLengthEncodedBuilder {
+       return &RunLengthEncodedBuilder{
+               builder: builder{refCount: 1, mem: mem},
+               dt:      arrow.RunLengthEncodedOf(typ),
+               runEnds: NewInt32Builder(mem),
+               values:  NewBuilder(mem, typ),
+       }
+}
+
+func (b *RunLengthEncodedBuilder) Type() arrow.DataType {
+       return b.dt
+}
+
+func (b *RunLengthEncodedBuilder) Release() {
+       debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+       if atomic.AddInt64(&b.refCount, -1) == 0 {
+               b.values.Release()
+               b.runEnds.Release()
+       }
+}
+
+func (b *RunLengthEncodedBuilder) addLength(n uint32) {
+       if b.length+int(n) > math.MaxInt32 {
+               panic(fmt.Errorf("%w: run-length encoded array length must fit 
in a 32-bit signed integer", arrow.ErrInvalid))
+       }
+
+       b.length += int(n)
+}
+
+func (b *RunLengthEncodedBuilder) finishRun() {
+       if b.length == 0 {
+               return
+       }
+
+       b.runEnds.Append(int32(b.length))
+}
+
+func (b *RunLengthEncodedBuilder) ValueBuilder() Builder { return b.values }
+func (b *RunLengthEncodedBuilder) Append(n uint32) {
+       b.finishRun()
+       b.addLength(n)
+}
+func (b *RunLengthEncodedBuilder) ContinueRun(n uint32) {
+       b.addLength(n)
+}
+func (b *RunLengthEncodedBuilder) AppendNull() {
+       b.finishRun()
+       b.values.AppendNull()
+       b.addLength(1)
+}
+
+func (b *RunLengthEncodedBuilder) NullN() int {
+       return UnknownNullCount
+}
+
+func (b *RunLengthEncodedBuilder) AppendEmptyValue() {
+       b.AppendNull()
+}
+
+func (b *RunLengthEncodedBuilder) Reserve(n int) {
+       b.values.Reserve(n)
+       b.runEnds.Reserve(n)
+}
+
+func (b *RunLengthEncodedBuilder) Resize(n int) {
+       b.values.Resize(n)
+       b.runEnds.Resize(n)
+}

Review Comment:
   Do you not need to resize/reserve the validity map?



##########
go/arrow/array/concat_test.go:
##########
@@ -568,3 +569,59 @@ func TestConcatDictionaryNullSlots(t *testing.T) {
 
        assert.Truef(t, array.Equal(actual, expected), "got: %s, expected: %s", 
actual, expected)
 }
+
+func TestConcatRunLengthEncoded(t *testing.T) {

Review Comment:
   Do you want any tests for the overflow cases?



##########
go/arrow/rle/rle_utils_test.go:
##########
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package rle_test
+
+import (
+       "fmt"
+       "strings"
+       "testing"
+
+       "github.com/apache/arrow/go/v10/arrow"
+       "github.com/apache/arrow/go/v10/arrow/array"
+       "github.com/apache/arrow/go/v10/arrow/memory"
+       "github.com/apache/arrow/go/v10/arrow/rle"
+       "github.com/stretchr/testify/assert"
+)
+
+func TestFindPhysicalOffset(t *testing.T) {
+       tests := []struct {
+               vals []int32
+               find int
+               exp  int
+       }{
+               {[]int32{1}, 0, 0},
+               {[]int32{1, 2, 3}, 0, 0},
+               {[]int32{1, 2, 3}, 1, 1},
+               {[]int32{1, 2, 3}, 2, 2},
+               {[]int32{2, 3, 4}, 0, 0},
+               {[]int32{2, 3, 4}, 1, 0},
+               {[]int32{2, 3, 4}, 2, 1},
+               {[]int32{2, 3, 4}, 3, 2},
+               {[]int32{2, 4, 6}, 3, 1},
+               {[]int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 1000, 1005, 1015, 1020, 
1025, 1050}, 1000, 10},
+               // out-of-range logical offset should return len(vals)
+               {[]int32{2, 4, 6}, 6, 3},
+               {[]int32{2, 4, 6}, 10000, 3},

Review Comment:
   What about out-of-range in the negative direction?  Error?



##########
go/arrow/array/array.go:
##########
@@ -175,8 +175,8 @@ func init() {
                arrow.LARGE_STRING:            func(data arrow.ArrayData) 
arrow.Array { return NewLargeStringData(data) },
                arrow.LARGE_BINARY:            func(data arrow.ArrayData) 
arrow.Array { return NewLargeBinaryData(data) },
                arrow.LARGE_LIST:              func(data arrow.ArrayData) 
arrow.Array { return NewLargeListData(data) },
-               arrow.INTERVAL:                func(data arrow.ArrayData) 
arrow.Array { return NewIntervalData(data) },

Review Comment:
   I assume getting rid of interval is an intentional unrelated change and not 
a mistake?



##########
go/arrow/array/concat.go:
##########
@@ -512,6 +518,78 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) 
(arrow.ArrayData, erro
                if err != nil {
                        return nil, err
                }
+       case *arrow.RunLengthEncodedType:
+               physicalLength, overflow := int32(0), false
+               // we can't use gatherChildren because the Offset and Len of
+               // data doesn't correspond to the physical length or offset
+               runs := make([]arrow.ArrayData, len(data))
+               values := make([]arrow.ArrayData, len(data))
+               for i, d := range data {
+                       plen := rle.GetPhysicalLength(d)
+                       off := rle.GetPhysicalOffset(d)
+
+                       runs[i] = NewSliceData(d.Children()[0], int64(off), 
int64(off+plen))
+                       defer runs[i].Release()
+                       values[i] = NewSliceData(d.Children()[1], int64(off), 
int64(off+plen))
+                       defer values[i].Release()
+
+                       physicalLength, overflow = addOvf32(physicalLength, 
int32(plen))
+                       if overflow {
+                               return nil, fmt.Errorf("%w: run length encoded 
array length must fit into a 32-bit signed integer",

Review Comment:
   Does go use 64-bit lengths for arrays?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] westonpace commented on a diff in pull request #14146: ARROW-17750: [Go] Kernel Input Type for RLE

Reply via email to