pitrou commented on code in PR #13770:
URL: https://github.com/apache/arrow/pull/13770#discussion_r935696173
##########
go/arrow/array/list.go:
##########
@@ -146,28 +146,177 @@ func (a *List) Release() {
a.values.Release()
}
-type ListBuilder struct {
+// LargeList represents an immutable sequence of array values.
+type LargeList struct {
+ array
+ values arrow.Array
+ offsets []int64
+}
+
+// NewLargeListData returns a new LargeList array value, from data.
+func NewLargeListData(data arrow.ArrayData) *LargeList {
+ a := &LargeList{}
+ a.refCount = 1
+ a.setData(data.(*Data))
+ return a
+}
+
+func (a *LargeList) ListValues() arrow.Array { return a.values }
+
+func (a *LargeList) String() string {
+ o := new(strings.Builder)
+ o.WriteString("[")
+ for i := 0; i < a.Len(); i++ {
+ if i > 0 {
+ o.WriteString(" ")
+ }
+ if !a.IsValid(i) {
+ o.WriteString("(null)")
+ continue
+ }
+ sub := a.newListValue(i)
+ fmt.Fprintf(o, "%v", sub)
+ sub.Release()
+ }
+ o.WriteString("]")
+ return o.String()
+}
+
+func (a *LargeList) newListValue(i int) arrow.Array {
+ j := i + a.array.data.offset
+ beg := int64(a.offsets[j])
+ end := int64(a.offsets[j+1])
+ return NewSlice(a.values, beg, end)
+}
+
+func (a *LargeList) setData(data *Data) {
+ a.array.setData(data)
+ vals := data.buffers[1]
+ if vals != nil {
+ a.offsets = arrow.Int64Traits.CastFromBytes(vals.Bytes())
+ }
+ a.values = MakeFromData(data.childData[0])
+}
+
+func (a *LargeList) getOneForMarshal(i int) interface{} {
+ if a.IsNull(i) {
+ return nil
+ }
+
+ slice := a.newListValue(i)
+ defer slice.Release()
+ v, err := json.Marshal(slice)
+ if err != nil {
+ panic(err)
+ }
+ return json.RawMessage(v)
+}
+
+func (a *LargeList) MarshalJSON() ([]byte, error) {
+ var buf bytes.Buffer
+ enc := json.NewEncoder(&buf)
+
+ buf.WriteByte('[')
+ for i := 0; i < a.Len(); i++ {
+ if i != 0 {
+ buf.WriteByte(',')
+ }
+ if err := enc.Encode(a.getOneForMarshal(i)); err != nil {
+ return nil, err
+ }
+ }
+ buf.WriteByte(']')
+ return buf.Bytes(), nil
+}
+
+func arrayEqualLargeList(left, right *LargeList) bool {
+ for i := 0; i < left.Len(); i++ {
+ if left.IsNull(i) {
+ continue
+ }
+ o := func() bool {
+ l := left.newListValue(i)
+ defer l.Release()
+ r := right.newListValue(i)
+ defer r.Release()
+ return Equal(l, r)
+ }()
+ if !o {
+ return false
+ }
+ }
+ return true
+}
+
+// Len returns the number of elements in the array.
+func (a *LargeList) Len() int { return a.array.Len() }
+
+func (a *LargeList) Offsets() []int64 { return a.offsets }
+
+func (a *LargeList) Retain() {
+ a.array.Retain()
+ a.values.Retain()
+}
+
+func (a *LargeList) Release() {
+ a.array.Release()
+ a.values.Release()
+}
+
+type listBuilder struct {
builder
etype arrow.DataType // data type of the list's elements.
values Builder // value builder for the list's elements.
- offsets *Int32Builder
+ offsets Builder
+
+ dt arrow.DataType
Review Comment:
Is it the actual list type or the offset type? (perhaps add a comment?)
##########
go/arrow/ipc/writer.go:
##########
@@ -562,6 +562,36 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array)
error {
p.body = append(p.body, voffsets)
p.body = append(p.body, values)
+ case *arrow.LargeBinaryType:
Review Comment:
Isn't this exactly the same code as for BinaryType above? Can this perhaps
be mutualized?
##########
go/arrow/array/list.go:
##########
@@ -146,28 +146,177 @@ func (a *List) Release() {
a.values.Release()
}
-type ListBuilder struct {
+// LargeList represents an immutable sequence of array values.
+type LargeList struct {
+ array
+ values arrow.Array
+ offsets []int64
+}
+
+// NewLargeListData returns a new LargeList array value, from data.
+func NewLargeListData(data arrow.ArrayData) *LargeList {
+ a := &LargeList{}
+ a.refCount = 1
+ a.setData(data.(*Data))
+ return a
+}
+
+func (a *LargeList) ListValues() arrow.Array { return a.values }
+
+func (a *LargeList) String() string {
+ o := new(strings.Builder)
+ o.WriteString("[")
+ for i := 0; i < a.Len(); i++ {
+ if i > 0 {
+ o.WriteString(" ")
+ }
+ if !a.IsValid(i) {
+ o.WriteString("(null)")
+ continue
+ }
+ sub := a.newListValue(i)
+ fmt.Fprintf(o, "%v", sub)
+ sub.Release()
+ }
+ o.WriteString("]")
+ return o.String()
+}
+
+func (a *LargeList) newListValue(i int) arrow.Array {
+ j := i + a.array.data.offset
+ beg := int64(a.offsets[j])
+ end := int64(a.offsets[j+1])
+ return NewSlice(a.values, beg, end)
+}
+
+func (a *LargeList) setData(data *Data) {
+ a.array.setData(data)
+ vals := data.buffers[1]
+ if vals != nil {
+ a.offsets = arrow.Int64Traits.CastFromBytes(vals.Bytes())
+ }
+ a.values = MakeFromData(data.childData[0])
+}
+
+func (a *LargeList) getOneForMarshal(i int) interface{} {
+ if a.IsNull(i) {
+ return nil
+ }
+
+ slice := a.newListValue(i)
+ defer slice.Release()
+ v, err := json.Marshal(slice)
+ if err != nil {
+ panic(err)
+ }
+ return json.RawMessage(v)
+}
+
+func (a *LargeList) MarshalJSON() ([]byte, error) {
+ var buf bytes.Buffer
+ enc := json.NewEncoder(&buf)
+
+ buf.WriteByte('[')
+ for i := 0; i < a.Len(); i++ {
+ if i != 0 {
+ buf.WriteByte(',')
+ }
+ if err := enc.Encode(a.getOneForMarshal(i)); err != nil {
+ return nil, err
+ }
+ }
+ buf.WriteByte(']')
+ return buf.Bytes(), nil
+}
+
+func arrayEqualLargeList(left, right *LargeList) bool {
+ for i := 0; i < left.Len(); i++ {
+ if left.IsNull(i) {
+ continue
+ }
+ o := func() bool {
+ l := left.newListValue(i)
+ defer l.Release()
+ r := right.newListValue(i)
+ defer r.Release()
+ return Equal(l, r)
+ }()
+ if !o {
+ return false
+ }
+ }
+ return true
+}
+
+// Len returns the number of elements in the array.
+func (a *LargeList) Len() int { return a.array.Len() }
+
+func (a *LargeList) Offsets() []int64 { return a.offsets }
+
+func (a *LargeList) Retain() {
+ a.array.Retain()
+ a.values.Retain()
+}
+
+func (a *LargeList) Release() {
+ a.array.Release()
+ a.values.Release()
+}
+
+type listBuilder struct {
Review Comment:
I don't know how the Go culture goes about this, but isn't it a bit
confusing to have both `listBuilder` and `ListBuilder` classes here?
##########
go/arrow/cdata/cdata.go:
##########
@@ -433,7 +470,8 @@ func (imp *cimporter) importListLike() error {
return err
}
- offsets := imp.importOffsetsBuffer(1)
+ offsetSize := imp.dt.Layout().Buffers[1].ByteWidth
Review Comment:
Can the `ByteWidth` trick also be used for String/LargeString instead of
having duplicate import routines?
##########
go/arrow/array/list.go:
##########
@@ -181,29 +330,36 @@ func (b *ListBuilder) Release() {
b.offsets.Release()
}
-func (b *ListBuilder) appendNextOffset() {
- b.offsets.Append(int32(b.values.Len()))
+func (b *listBuilder) appendNextOffset() {
+ b.appendOffsetVal(b.values.Len())
Review Comment:
So this is going through a function pointer and/or vtable I assume? I don't
know how much optimized you expect builders to be, but is some inlining
automatically done by the compiler here when the derived listBuilder type is
known?
##########
go/arrow/array/concat_test.go:
##########
@@ -158,6 +159,32 @@ func (cts *ConcatTestSuite) generateArr(size int64,
nullprob float64) arrow.Arra
bldr := array.NewListBuilder(memory.DefaultAllocator,
arrow.PrimitiveTypes.Int8)
defer bldr.Release()
+ valid := make([]bool, len(offsetsVector)-1)
+ for i := range valid {
+ valid[i] = true
+ }
+ bldr.AppendValues(offsetsVector, valid)
+ vb := bldr.ValueBuilder().(*array.Int8Builder)
+ for i := 0; i < values.Len(); i++ {
+ if values.IsValid(i) {
+ vb.Append(values.Value(i))
+ } else {
+ vb.AppendNull()
+ }
+ }
+ return bldr.NewArray()
+ case arrow.LARGE_LIST:
+ valuesSize := size * 8
Review Comment:
Hmm... so, the way I understand this code, `size` is the logical length of
the list array and `valuesSize` the logical length of the child list array,
right?
Meaning that the multiplier here is arbitrary and you could have e.g.
`valuesSize := size * 13` or something?
##########
go/arrow/ipc/writer.go:
##########
@@ -719,13 +817,26 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr
arrow.Array) (*memory.Buffe
shiftedOffsets := memory.NewResizableBuffer(w.mem)
shiftedOffsets.Resize(offsetBytesNeeded)
- dest := arrow.Int32Traits.CastFromBytes(shiftedOffsets.Bytes())
- offsets :=
arrow.Int32Traits.CastFromBytes(voffsets.Bytes())[data.Offset() :
data.Offset()+data.Len()+1]
+ switch arr.DataType().Layout().Buffers[1].ByteWidth {
+ case 8:
+ dest :=
arrow.Int64Traits.CastFromBytes(shiftedOffsets.Bytes())
+ offsets :=
arrow.Int64Traits.CastFromBytes(voffsets.Bytes())[data.Offset() :
data.Offset()+data.Len()+1]
- startOffset := offsets[0]
- for i, o := range offsets {
- dest[i] = o - startOffset
+ startOffset := offsets[0]
+ for i, o := range offsets {
+ dest[i] = o - startOffset
+ }
+
+ default:
Review Comment:
Perhaps add a debug mode assertion that it has the value `4` here?
##########
go/arrow/datatype_binary.go:
##########
@@ -16,6 +16,10 @@
package arrow
+type OffsetTraits interface {
+ BytesRequired(int) int
Review Comment:
Should this get a docstring?
##########
go/arrow/array/list_test.go:
##########
@@ -211,3 +211,190 @@ func TestListArraySlice(t *testing.T) {
t.Fatalf("got=%q, want=%q", got, want)
}
}
+
+func TestLargeListArray(t *testing.T) {
+ pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer pool.AssertSize(t, 0)
+
+ var (
+ vs = []int32{0, 1, 2, 3, 4, 5, 6}
+ lengths = []int{3, 0, 4}
+ isValid = []bool{true, false, true}
+ offsets = []int64{0, 3, 3, 7}
Review Comment:
For the record, perhaps it would be nice to also test a zero-length non-null
list element as well? e.g.:
```suggestion
lengths = []int{3, 0, 0, 4}
isValid = []bool{true, false, true, true}
offsets = []int64{0, 3, 3, 3, 7}
```
##########
go/arrow/array/list_test.go:
##########
@@ -211,3 +211,190 @@ func TestListArraySlice(t *testing.T) {
t.Fatalf("got=%q, want=%q", got, want)
}
}
+
+func TestLargeListArray(t *testing.T) {
+ pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer pool.AssertSize(t, 0)
+
+ var (
+ vs = []int32{0, 1, 2, 3, 4, 5, 6}
+ lengths = []int{3, 0, 4}
+ isValid = []bool{true, false, true}
+ offsets = []int64{0, 3, 3, 7}
+ )
+
+ lb := array.NewLargeListBuilder(pool, arrow.PrimitiveTypes.Int32)
+ defer lb.Release()
+
+ for i := 0; i < 10; i++ {
+ vb := lb.ValueBuilder().(*array.Int32Builder)
+ vb.Reserve(len(vs))
+
+ pos := 0
+ for i, length := range lengths {
+ lb.Append(isValid[i])
+ for j := 0; j < length; j++ {
+ vb.Append(vs[pos])
+ pos++
+ }
+ }
+
+ arr := lb.NewArray().(*array.LargeList)
+ defer arr.Release()
+
+ arr.Retain()
+ arr.Release()
+
+ if got, want := arr.DataType().ID(), arrow.LARGE_LIST; got !=
want {
+ t.Fatalf("got=%v, want=%v", got, want)
Review Comment:
Just for the record, doesn't Go have a way to simplify this boilerplate? In
C++ or Python, we would use custom assertions (e.g. "ASSERT_EQ") provided by
the testing framework to reduce this to a simple one-liner and yet get good
error diagnostics.
##########
go/arrow/array/list.go:
##########
@@ -265,19 +427,31 @@ func (b *ListBuilder) NewListArray() (a *List) {
return
}
-func (b *ListBuilder) newData() (data *Data) {
+// NewListArray creates a List array from the memory buffers used by the
builder and resets the ListBuilder
Review Comment:
```suggestion
// NewLargeListArray creates a List array from the memory buffers used by
the builder and resets the LargeListBuilder
```
##########
go/arrow/cdata/cdata.go:
##########
@@ -513,14 +551,13 @@ func (imp *cimporter) importFixedSizeBuffer(bufferID int,
byteWidth int64) *memo
return imp.importBuffer(bufferID, bufsize)
}
-func (imp *cimporter) importOffsetsBuffer(bufferID int) *memory.Buffer {
- const offsetsize = int64(arrow.Int32SizeBytes) // go doesn't implement
int64 offsets yet
+func (imp *cimporter) importOffsetsBuffer(bufferID int, offsetsize int64)
*memory.Buffer {
bufsize := offsetsize * int64((imp.arr.length + imp.arr.offset + 1))
return imp.importBuffer(bufferID, bufsize)
}
-func (imp *cimporter) importVariableValuesBuffer(bufferID int, byteWidth int,
offsets []int32) *memory.Buffer {
- bufsize := byteWidth * int(offsets[imp.arr.length])
+func (imp *cimporter) importVariableValuesBuffer(bufferID int, byteWidth int,
bytelen int) *memory.Buffer {
+ bufsize := byteWidth * bytelen
Review Comment:
Two things: 1) isn't `bytelen` a bit misleading as a name? 2) shouldn't it
be an `int64`?
##########
go/arrow/array/list_test.go:
##########
@@ -211,3 +211,190 @@ func TestListArraySlice(t *testing.T) {
t.Fatalf("got=%q, want=%q", got, want)
}
}
+
+func TestLargeListArray(t *testing.T) {
+ pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer pool.AssertSize(t, 0)
+
+ var (
+ vs = []int32{0, 1, 2, 3, 4, 5, 6}
+ lengths = []int{3, 0, 4}
+ isValid = []bool{true, false, true}
+ offsets = []int64{0, 3, 3, 7}
+ )
+
+ lb := array.NewLargeListBuilder(pool, arrow.PrimitiveTypes.Int32)
+ defer lb.Release()
+
+ for i := 0; i < 10; i++ {
Review Comment:
Just for my understanding, why are we looping 10 times over the test body
below? Is it to try and detect memory management issues?
##########
go/arrow/array/list.go:
##########
@@ -253,6 +409,12 @@ func (b *ListBuilder) NewArray() arrow.Array {
return b.NewListArray()
}
+// NewArray creates a List array from the memory buffers used by the builder
and resets the ListBuilder
Review Comment:
Do you want to be specific here?
```suggestion
// NewArray creates a LargeList array from the memory buffers used by the
builder and resets the LargeListBuilder
```
##########
docs/source/status.rst:
##########
@@ -77,7 +77,7 @@ Data Types
+-------------------+-------+-------+-------+------------+-------+-------+-------+
| List | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓
|
+-------------------+-------+-------+-------+------------+-------+-------+-------+
-| Large List | ✓ | ✓ | | | | ✓ | ✓
|
+| Large List | ✓ | ✓ | ✓ | | | ✓ | ✓
|
Review Comment:
Is the PR title incorrect? This is updating the compatibility matrix for
Large List support, not Large String and Large Binary which are already ticked
for Go.
##########
go/arrow/ipc/writer.go:
##########
@@ -562,6 +562,36 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array)
error {
p.body = append(p.body, voffsets)
p.body = append(p.body, values)
+ case *arrow.LargeBinaryType:
Review Comment:
Incidentally, same question for String/LargeString and List/LargeList
##########
go/arrow/datatype_binary.go:
##########
@@ -16,6 +16,10 @@
package arrow
+type OffsetTraits interface {
+ BytesRequired(int) int
Review Comment:
Also, does this need to take and return `int64` instead?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]