pitrou commented on code in PR #13770:
URL: https://github.com/apache/arrow/pull/13770#discussion_r935696173


##########
go/arrow/array/list.go:
##########
@@ -146,28 +146,177 @@ func (a *List) Release() {
        a.values.Release()
 }
 
-type ListBuilder struct {
+// LargeList represents an immutable sequence of array values.
+type LargeList struct {
+       array
+       values  arrow.Array
+       offsets []int64
+}
+
+// NewLargeListData returns a new LargeList array value, from data.
+func NewLargeListData(data arrow.ArrayData) *LargeList {
+       a := &LargeList{}
+       a.refCount = 1
+       a.setData(data.(*Data))
+       return a
+}
+
+func (a *LargeList) ListValues() arrow.Array { return a.values }
+
+func (a *LargeList) String() string {
+       o := new(strings.Builder)
+       o.WriteString("[")
+       for i := 0; i < a.Len(); i++ {
+               if i > 0 {
+                       o.WriteString(" ")
+               }
+               if !a.IsValid(i) {
+                       o.WriteString("(null)")
+                       continue
+               }
+               sub := a.newListValue(i)
+               fmt.Fprintf(o, "%v", sub)
+               sub.Release()
+       }
+       o.WriteString("]")
+       return o.String()
+}
+
+func (a *LargeList) newListValue(i int) arrow.Array {
+       j := i + a.array.data.offset
+       beg := int64(a.offsets[j])
+       end := int64(a.offsets[j+1])
+       return NewSlice(a.values, beg, end)
+}
+
+func (a *LargeList) setData(data *Data) {
+       a.array.setData(data)
+       vals := data.buffers[1]
+       if vals != nil {
+               a.offsets = arrow.Int64Traits.CastFromBytes(vals.Bytes())
+       }
+       a.values = MakeFromData(data.childData[0])
+}
+
+func (a *LargeList) getOneForMarshal(i int) interface{} {
+       if a.IsNull(i) {
+               return nil
+       }
+
+       slice := a.newListValue(i)
+       defer slice.Release()
+       v, err := json.Marshal(slice)
+       if err != nil {
+               panic(err)
+       }
+       return json.RawMessage(v)
+}
+
+func (a *LargeList) MarshalJSON() ([]byte, error) {
+       var buf bytes.Buffer
+       enc := json.NewEncoder(&buf)
+
+       buf.WriteByte('[')
+       for i := 0; i < a.Len(); i++ {
+               if i != 0 {
+                       buf.WriteByte(',')
+               }
+               if err := enc.Encode(a.getOneForMarshal(i)); err != nil {
+                       return nil, err
+               }
+       }
+       buf.WriteByte(']')
+       return buf.Bytes(), nil
+}
+
+func arrayEqualLargeList(left, right *LargeList) bool {
+       for i := 0; i < left.Len(); i++ {
+               if left.IsNull(i) {
+                       continue
+               }
+               o := func() bool {
+                       l := left.newListValue(i)
+                       defer l.Release()
+                       r := right.newListValue(i)
+                       defer r.Release()
+                       return Equal(l, r)
+               }()
+               if !o {
+                       return false
+               }
+       }
+       return true
+}
+
+// Len returns the number of elements in the array.
+func (a *LargeList) Len() int { return a.array.Len() }
+
+func (a *LargeList) Offsets() []int64 { return a.offsets }
+
+func (a *LargeList) Retain() {
+       a.array.Retain()
+       a.values.Retain()
+}
+
+func (a *LargeList) Release() {
+       a.array.Release()
+       a.values.Release()
+}
+
+type listBuilder struct {
        builder
 
        etype   arrow.DataType // data type of the list's elements.
        values  Builder        // value builder for the list's elements.
-       offsets *Int32Builder
+       offsets Builder
+
+       dt              arrow.DataType

Review Comment:
   Is it the actual list type or the offset type? (perhaps add a comment?)



##########
go/arrow/ipc/writer.go:
##########
@@ -562,6 +562,36 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) 
error {
                p.body = append(p.body, voffsets)
                p.body = append(p.body, values)
 
+       case *arrow.LargeBinaryType:

Review Comment:
   Isn't this exactly the same code as for BinaryType above? Can this perhaps 
be mutualized?



##########
go/arrow/array/list.go:
##########
@@ -146,28 +146,177 @@ func (a *List) Release() {
        a.values.Release()
 }
 
-type ListBuilder struct {
+// LargeList represents an immutable sequence of array values.
+type LargeList struct {
+       array
+       values  arrow.Array
+       offsets []int64
+}
+
+// NewLargeListData returns a new LargeList array value, from data.
+func NewLargeListData(data arrow.ArrayData) *LargeList {
+       a := &LargeList{}
+       a.refCount = 1
+       a.setData(data.(*Data))
+       return a
+}
+
+func (a *LargeList) ListValues() arrow.Array { return a.values }
+
+func (a *LargeList) String() string {
+       o := new(strings.Builder)
+       o.WriteString("[")
+       for i := 0; i < a.Len(); i++ {
+               if i > 0 {
+                       o.WriteString(" ")
+               }
+               if !a.IsValid(i) {
+                       o.WriteString("(null)")
+                       continue
+               }
+               sub := a.newListValue(i)
+               fmt.Fprintf(o, "%v", sub)
+               sub.Release()
+       }
+       o.WriteString("]")
+       return o.String()
+}
+
+func (a *LargeList) newListValue(i int) arrow.Array {
+       j := i + a.array.data.offset
+       beg := int64(a.offsets[j])
+       end := int64(a.offsets[j+1])
+       return NewSlice(a.values, beg, end)
+}
+
+func (a *LargeList) setData(data *Data) {
+       a.array.setData(data)
+       vals := data.buffers[1]
+       if vals != nil {
+               a.offsets = arrow.Int64Traits.CastFromBytes(vals.Bytes())
+       }
+       a.values = MakeFromData(data.childData[0])
+}
+
+func (a *LargeList) getOneForMarshal(i int) interface{} {
+       if a.IsNull(i) {
+               return nil
+       }
+
+       slice := a.newListValue(i)
+       defer slice.Release()
+       v, err := json.Marshal(slice)
+       if err != nil {
+               panic(err)
+       }
+       return json.RawMessage(v)
+}
+
+func (a *LargeList) MarshalJSON() ([]byte, error) {
+       var buf bytes.Buffer
+       enc := json.NewEncoder(&buf)
+
+       buf.WriteByte('[')
+       for i := 0; i < a.Len(); i++ {
+               if i != 0 {
+                       buf.WriteByte(',')
+               }
+               if err := enc.Encode(a.getOneForMarshal(i)); err != nil {
+                       return nil, err
+               }
+       }
+       buf.WriteByte(']')
+       return buf.Bytes(), nil
+}
+
+func arrayEqualLargeList(left, right *LargeList) bool {
+       for i := 0; i < left.Len(); i++ {
+               if left.IsNull(i) {
+                       continue
+               }
+               o := func() bool {
+                       l := left.newListValue(i)
+                       defer l.Release()
+                       r := right.newListValue(i)
+                       defer r.Release()
+                       return Equal(l, r)
+               }()
+               if !o {
+                       return false
+               }
+       }
+       return true
+}
+
+// Len returns the number of elements in the array.
+func (a *LargeList) Len() int { return a.array.Len() }
+
+func (a *LargeList) Offsets() []int64 { return a.offsets }
+
+func (a *LargeList) Retain() {
+       a.array.Retain()
+       a.values.Retain()
+}
+
+func (a *LargeList) Release() {
+       a.array.Release()
+       a.values.Release()
+}
+
+type listBuilder struct {

Review Comment:
   I don't know how the Go culture goes about this, but isn't it a bit 
confusing to have both `listBuilder` and `ListBuilder` classes here?



##########
go/arrow/cdata/cdata.go:
##########
@@ -433,7 +470,8 @@ func (imp *cimporter) importListLike() error {
                return err
        }
 
-       offsets := imp.importOffsetsBuffer(1)
+       offsetSize := imp.dt.Layout().Buffers[1].ByteWidth

Review Comment:
   Can the `ByteWidth` trick also be used for String/LargeString instead of 
having duplicate import routines?



##########
go/arrow/array/list.go:
##########
@@ -181,29 +330,36 @@ func (b *ListBuilder) Release() {
        b.offsets.Release()
 }
 
-func (b *ListBuilder) appendNextOffset() {
-       b.offsets.Append(int32(b.values.Len()))
+func (b *listBuilder) appendNextOffset() {
+       b.appendOffsetVal(b.values.Len())

Review Comment:
   So this is going through a function pointer and/or vtable I assume? I don't 
know how much optimized you expect builders to be, but is some inlining 
automatically done by the compiler here when the derived listBuilder type is 
known?



##########
go/arrow/array/concat_test.go:
##########
@@ -158,6 +159,32 @@ func (cts *ConcatTestSuite) generateArr(size int64, 
nullprob float64) arrow.Arra
                bldr := array.NewListBuilder(memory.DefaultAllocator, 
arrow.PrimitiveTypes.Int8)
                defer bldr.Release()
 
+               valid := make([]bool, len(offsetsVector)-1)
+               for i := range valid {
+                       valid[i] = true
+               }
+               bldr.AppendValues(offsetsVector, valid)
+               vb := bldr.ValueBuilder().(*array.Int8Builder)
+               for i := 0; i < values.Len(); i++ {
+                       if values.IsValid(i) {
+                               vb.Append(values.Value(i))
+                       } else {
+                               vb.AppendNull()
+                       }
+               }
+               return bldr.NewArray()
+       case arrow.LARGE_LIST:
+               valuesSize := size * 8

Review Comment:
   Hmm... so, the way I understand this code, `size` is the logical length of 
the list array and `valuesSize` the logical length of the child list array, 
right?
   
   Meaning that the multiplier here is arbitrary and you could have e.g. 
`valuesSize := size * 13` or something?



##########
go/arrow/ipc/writer.go:
##########
@@ -719,13 +817,26 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr 
arrow.Array) (*memory.Buffe
                shiftedOffsets := memory.NewResizableBuffer(w.mem)
                shiftedOffsets.Resize(offsetBytesNeeded)
 
-               dest := arrow.Int32Traits.CastFromBytes(shiftedOffsets.Bytes())
-               offsets := 
arrow.Int32Traits.CastFromBytes(voffsets.Bytes())[data.Offset() : 
data.Offset()+data.Len()+1]
+               switch arr.DataType().Layout().Buffers[1].ByteWidth {
+               case 8:
+                       dest := 
arrow.Int64Traits.CastFromBytes(shiftedOffsets.Bytes())
+                       offsets := 
arrow.Int64Traits.CastFromBytes(voffsets.Bytes())[data.Offset() : 
data.Offset()+data.Len()+1]
 
-               startOffset := offsets[0]
-               for i, o := range offsets {
-                       dest[i] = o - startOffset
+                       startOffset := offsets[0]
+                       for i, o := range offsets {
+                               dest[i] = o - startOffset
+                       }
+
+               default:

Review Comment:
   Perhaps add a debug mode assertion that it has the value `4` here?



##########
go/arrow/datatype_binary.go:
##########
@@ -16,6 +16,10 @@
 
 package arrow
 
+type OffsetTraits interface {
+       BytesRequired(int) int

Review Comment:
   Should this get a docstring?



##########
go/arrow/array/list_test.go:
##########
@@ -211,3 +211,190 @@ func TestListArraySlice(t *testing.T) {
                t.Fatalf("got=%q, want=%q", got, want)
        }
 }
+
+func TestLargeListArray(t *testing.T) {
+       pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+       defer pool.AssertSize(t, 0)
+
+       var (
+               vs      = []int32{0, 1, 2, 3, 4, 5, 6}
+               lengths = []int{3, 0, 4}
+               isValid = []bool{true, false, true}
+               offsets = []int64{0, 3, 3, 7}

Review Comment:
   For the record, perhaps it would be nice to also test a zero-length non-null 
list element as well? e.g.:
   ```suggestion
                lengths = []int{3, 0, 0, 4}
                isValid = []bool{true, false, true, true}
                offsets = []int64{0, 3, 3, 3, 7}
   ```



##########
go/arrow/array/list_test.go:
##########
@@ -211,3 +211,190 @@ func TestListArraySlice(t *testing.T) {
                t.Fatalf("got=%q, want=%q", got, want)
        }
 }
+
+func TestLargeListArray(t *testing.T) {
+       pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+       defer pool.AssertSize(t, 0)
+
+       var (
+               vs      = []int32{0, 1, 2, 3, 4, 5, 6}
+               lengths = []int{3, 0, 4}
+               isValid = []bool{true, false, true}
+               offsets = []int64{0, 3, 3, 7}
+       )
+
+       lb := array.NewLargeListBuilder(pool, arrow.PrimitiveTypes.Int32)
+       defer lb.Release()
+
+       for i := 0; i < 10; i++ {
+               vb := lb.ValueBuilder().(*array.Int32Builder)
+               vb.Reserve(len(vs))
+
+               pos := 0
+               for i, length := range lengths {
+                       lb.Append(isValid[i])
+                       for j := 0; j < length; j++ {
+                               vb.Append(vs[pos])
+                               pos++
+                       }
+               }
+
+               arr := lb.NewArray().(*array.LargeList)
+               defer arr.Release()
+
+               arr.Retain()
+               arr.Release()
+
+               if got, want := arr.DataType().ID(), arrow.LARGE_LIST; got != 
want {
+                       t.Fatalf("got=%v, want=%v", got, want)

Review Comment:
   Just for the record, doesn't Go have a way to simplify this boilerplate? In 
C++ or Python, we would use custom assertions (e.g. "ASSERT_EQ") provided by 
the testing framework to reduce this to a simple one-liner and yet get good 
error diagnostics.



##########
go/arrow/array/list.go:
##########
@@ -265,19 +427,31 @@ func (b *ListBuilder) NewListArray() (a *List) {
        return
 }
 
-func (b *ListBuilder) newData() (data *Data) {
+// NewListArray creates a List array from the memory buffers used by the 
builder and resets the ListBuilder

Review Comment:
   ```suggestion
   // NewLargeListArray creates a List array from the memory buffers used by 
the builder and resets the LargeListBuilder
   ```



##########
go/arrow/cdata/cdata.go:
##########
@@ -513,14 +551,13 @@ func (imp *cimporter) importFixedSizeBuffer(bufferID int, 
byteWidth int64) *memo
        return imp.importBuffer(bufferID, bufsize)
 }
 
-func (imp *cimporter) importOffsetsBuffer(bufferID int) *memory.Buffer {
-       const offsetsize = int64(arrow.Int32SizeBytes) // go doesn't implement 
int64 offsets yet
+func (imp *cimporter) importOffsetsBuffer(bufferID int, offsetsize int64) 
*memory.Buffer {
        bufsize := offsetsize * int64((imp.arr.length + imp.arr.offset + 1))
        return imp.importBuffer(bufferID, bufsize)
 }
 
-func (imp *cimporter) importVariableValuesBuffer(bufferID int, byteWidth int, 
offsets []int32) *memory.Buffer {
-       bufsize := byteWidth * int(offsets[imp.arr.length])
+func (imp *cimporter) importVariableValuesBuffer(bufferID int, byteWidth int, 
bytelen int) *memory.Buffer {
+       bufsize := byteWidth * bytelen

Review Comment:
   Two things: 1) isn't `bytelen` a bit misleading as a name? 2) shouldn't it 
be an `int64`?
   



##########
go/arrow/array/list_test.go:
##########
@@ -211,3 +211,190 @@ func TestListArraySlice(t *testing.T) {
                t.Fatalf("got=%q, want=%q", got, want)
        }
 }
+
+func TestLargeListArray(t *testing.T) {
+       pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
+       defer pool.AssertSize(t, 0)
+
+       var (
+               vs      = []int32{0, 1, 2, 3, 4, 5, 6}
+               lengths = []int{3, 0, 4}
+               isValid = []bool{true, false, true}
+               offsets = []int64{0, 3, 3, 7}
+       )
+
+       lb := array.NewLargeListBuilder(pool, arrow.PrimitiveTypes.Int32)
+       defer lb.Release()
+
+       for i := 0; i < 10; i++ {

Review Comment:
   Just for my understanding, why are we looping 10 times over the test body 
below? Is it to try and detect memory management issues?



##########
go/arrow/array/list.go:
##########
@@ -253,6 +409,12 @@ func (b *ListBuilder) NewArray() arrow.Array {
        return b.NewListArray()
 }
 
+// NewArray creates a List array from the memory buffers used by the builder 
and resets the ListBuilder

Review Comment:
   Do you want to be specific here?
   ```suggestion
   // NewArray creates a LargeList array from the memory buffers used by the 
builder and resets the LargeListBuilder
   ```



##########
docs/source/status.rst:
##########
@@ -77,7 +77,7 @@ Data Types
 
+-------------------+-------+-------+-------+------------+-------+-------+-------+
 | List              | ✓     | ✓     | ✓     | ✓          |  ✓    |  ✓    | ✓   
  |
 
+-------------------+-------+-------+-------+------------+-------+-------+-------+
-| Large List        | ✓     | ✓     |       |            |       |  ✓    | ✓   
  |
+| Large List        | ✓     | ✓     | ✓     |            |       |  ✓    | ✓   
  |

Review Comment:
   Is the PR title incorrect? This is updating the compatibility matrix for 
Large List support, not Large String and Large Binary which are already ticked 
for Go.



##########
go/arrow/ipc/writer.go:
##########
@@ -562,6 +562,36 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) 
error {
                p.body = append(p.body, voffsets)
                p.body = append(p.body, values)
 
+       case *arrow.LargeBinaryType:

Review Comment:
   Incidentally, same question for String/LargeString and List/LargeList



##########
go/arrow/datatype_binary.go:
##########
@@ -16,6 +16,10 @@
 
 package arrow
 
+type OffsetTraits interface {
+       BytesRequired(int) int

Review Comment:
   Also, does this need to take and return `int64` instead?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to