pitrou commented on code in PR #13806:
URL: https://github.com/apache/arrow/pull/13806#discussion_r940165319
##########
go/arrow/internal/arrdata/arrdata.go:
##########
@@ -935,6 +935,64 @@ func makeExtensionRecords() []arrow.Record {
return recs
}
+func makeUnionRecords() []arrow.Record {
+ mem := memory.NewGoAllocator()
+
+ unionFields := []arrow.Field{
+ {Name: "u0", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
+ {Name: "u1", Type: arrow.PrimitiveTypes.Uint8, Nullable: true},
+ }
+
+ typeCodes := []arrow.UnionTypeCode{5, 10}
+ sparseType := arrow.SparseUnionOf(unionFields, typeCodes)
+ denseType := arrow.DenseUnionOf(unionFields, typeCodes)
+
+ schema := arrow.NewSchema([]arrow.Field{
+ {Name: "sparse", Type: sparseType, Nullable: true},
+ {Name: "dense", Type: denseType, Nullable: true},
+ }, nil)
+
+ sparseChildren := make([]arrow.Array, 4)
+ denseChildren := make([]arrow.Array, 4)
+
+ const length = 7
+
+ typeIDsBuffer := memory.NewBufferBytes(arrow.Uint8Traits.CastToBytes([]uint8{5, 10, 5, 5, 10, 10, 5}))
+ sparseChildren[0] = arrayOf(mem, []int32{0, 1, 2, 3, 4, 5, 6}, nil)
+ defer sparseChildren[0].Release()
+ sparseChildren[1] = arrayOf(mem, []uint8{10, 11, 12, 13, 14, 15, 16}, nil)
+ defer sparseChildren[1].Release()
+ sparseChildren[2] = arrayOf(mem, []int32{0, -1, -2, -3, -4, -5, -6}, nil)
+ defer sparseChildren[2].Release()
+ sparseChildren[3] = arrayOf(mem, []uint8{100, 101, 102, 103, 104, 105, 106}, nil)
+ defer sparseChildren[3].Release()
+
+ denseChildren[0] = arrayOf(mem, []int32{0, 2, 3, 7}, nil)
+ defer denseChildren[0].Release()
+ denseChildren[1] = arrayOf(mem, []uint8{11, 14, 15}, nil)
+ defer denseChildren[1].Release()
+ denseChildren[2] = arrayOf(mem, []int32{0, -2, -3, -7}, nil)
+ defer denseChildren[2].Release()
+ denseChildren[3] = arrayOf(mem, []uint8{101, 104, 105}, nil)
+ defer denseChildren[3].Release()
Review Comment:
Should you add some nulls in at least some of these child arrays?
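For reference, a minimal sketch of what that could look like, assuming the `arrayOf` helper accepts a validity mask as its third argument (as it is used elsewhere in arrdata.go):

```go
// hypothetical: mark positions 1 and 4 of one sparse child as null
mask := []bool{true, false, true, true, false, true, true}
sparseChildren[0] = arrayOf(mem, []int32{0, 1, 2, 3, 4, 5, 6}, mask)
defer sparseChildren[0].Release()
```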
##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -1152,6 +1176,31 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr
defer indices.Release()
return array.NewData(dt, indices.Len(), indices.Buffers(), indices.Children(), indices.NullN(), indices.Offset())
+ case arrow.UnionType:
+ fields := make([]arrow.ArrayData, len(dt.Fields()))
+ for i, f := range dt.Fields() {
+ child := arrayFromJSON(mem, f.Type, arr.Children[i])
+ defer child.Release()
+ fields[i] = child
+ }
+
+ typeIdBuf := memory.NewBufferBytes(arrow.Int8Traits.CastToBytes(arr.TypeID))
+ defer typeIdBuf.Release()
+ buffers := []*memory.Buffer{nil, typeIdBuf}
+ if dt.Mode() == arrow.DenseMode {
+ var offsets []byte
+ if arr.Offset == nil {
+ offsets = []byte{}
Review Comment:
Does that actually happen?
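If it can, it would be because encoding/json leaves a slice field nil when the key is absent from the input. A standalone sketch of that behavior (with a hypothetical stand-in for the arrjson struct):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// hypothetical stand-in for the arrjson Array field
type rec struct {
	Offset []int32 `json:"OFFSET"`
}

func main() {
	var r rec
	// no OFFSET key in the input, so r.Offset stays nil
	if err := json.Unmarshal([]byte(`{}`), &r); err != nil {
		panic(err)
	}
	fmt.Println(r.Offset == nil) // prints: true
}
```

So the guard would only be reachable for a dense union entry written without an OFFSET key, if such files exist.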
##########
go/arrow/ipc/writer.go:
##########
@@ -478,23 +479,25 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error {
return nil
}
- switch arr.NullN() {
- case 0:
- // there are no null values, drop the null bitmap
- p.body = append(p.body, nil)
- default:
- data := arr.Data()
- var bitmap *memory.Buffer
- if data.NullN() == data.Len() {
- // every value is null, just use a new unset bitmap to avoid the expense of copying
- bitmap = memory.NewResizableBuffer(w.mem)
- minLength := paddedLength(bitutil.BytesForBits(int64(data.Len())), kArrowAlignment)
- bitmap.Resize(int(minLength))
- } else {
- // otherwise truncate and copy the bits
- bitmap = newTruncatedBitmap(w.mem, int64(data.Offset()), int64(data.Len()), data.Buffers()[0])
+ if hasValidityBitmap(arr.DataType().ID(), currentMetadataVersion) {
+ switch arr.NullN() {
+ case 0:
+ // there are no null values, drop the null bitmap
+ p.body = append(p.body, nil)
+ default:
+ data := arr.Data()
+ var bitmap *memory.Buffer
+ if data.NullN() == data.Len() {
+ // every value is null, just use a new unset bitmap to avoid the expense of copying
Review Comment:
By "unset", do you mean zero-initialized?
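Assuming yes: the Go allocator hands back zeroed memory, so a freshly resized buffer is an all-zeros bitmap, i.e. every slot reads as null. A quick sketch of that assumption:

```go
// assumption: a new resizable buffer from the Go allocator is zero-filled,
// so every validity bit is 0 (null) without copying anything
bitmap := memory.NewResizableBuffer(memory.NewGoAllocator())
bitmap.Resize(8)
for _, b := range bitmap.Bytes() {
	if b != 0 {
		panic("not zero-initialized")
	}
}
```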
##########
go/arrow/ipc/writer.go:
##########
@@ -724,6 +796,34 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffe
return voffsets, nil
}
+func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer {
+ // this case sucks. Because the offsets are different for each
+ // child array, when we have a sliced array, we need to re-base
+ // the value offsets for each array! ew.
+ unshiftedOffsets := arr.RawValueOffsets()
+ codes := arr.RawTypeCodes()
+
+ shiftedOffsetsBuf := memory.NewResizableBuffer(w.mem)
+ shiftedOffsetsBuf.Resize(arrow.Int32Traits.BytesRequired(arr.Len()))
+ shiftedOffsets := arrow.Int32Traits.CastFromBytes(shiftedOffsetsBuf.Bytes())
+
+ // offsets may not be ascending, so we need to find out the start offset for each child
Review Comment:
I filed ARROW-17339.
##########
go/arrow/ipc/writer.go:
##########
@@ -724,6 +796,34 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffe
return voffsets, nil
}
+func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer {
+ // this case sucks. Because the offsets are different for each
+ // child array, when we have a sliced array, we need to re-base
+ // the value offsets for each array! ew.
+ unshiftedOffsets := arr.RawValueOffsets()
+ codes := arr.RawTypeCodes()
+
+ shiftedOffsetsBuf := memory.NewResizableBuffer(w.mem)
+ shiftedOffsetsBuf.Resize(arrow.Int32Traits.BytesRequired(arr.Len()))
+ shiftedOffsets := arrow.Int32Traits.CastFromBytes(shiftedOffsetsBuf.Bytes())
+
+ // offsets may not be ascending, so we need to find out the start offset for each child
Review Comment:
The [spec](https://arrow.apache.org/docs/format/Columnar.html#dense-union) says:
> The respective offsets for each child value array must be in order / increasing.
##########
go/arrow/ipc/file_reader.go:
##########
@@ -617,6 +626,47 @@ func (ctx *arrayLoaderContext) loadStruct(dt *arrow.StructType) arrow.ArrayData
return array.NewData(dt, int(field.Length()), buffers, subs, int(field.NullCount()), 0)
}
+func (ctx *arrayLoaderContext) loadUnion(dt arrow.UnionType) arrow.ArrayData {
+ // Sparse unions have 2 buffers (a nil validity bitmap, and the type ids)
+ nBuffers := 2
+ // Dense unions have a third buffer, the offsets
+ if dt.Mode() == arrow.DenseMode {
+ nBuffers = 3
+ }
+
+ field, buffers := ctx.loadCommon(dt.ID(), nBuffers)
+ if field.NullCount() != 0 && buffers[0] != nil {
+ panic("arrow/ipc: cannot read pre-1.0.0 union array with top-level validity bitmap")
+ }
+
+ switch field.Length() {
+ case 0:
+ buffers = append(buffers, memory.NewBufferBytes([]byte{}))
Review Comment:
Is this the type codes buffer?
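If so, the layout being assembled would presumably be (a sketch of the assumption, based on the comments at the top of `loadUnion`, not the PR's exact code):

```go
// assumed union buffer layout:
//   buffers[0] = nil             // no top-level validity bitmap post-1.0
//   buffers[1] = type codes      // empty here, since the field length is 0
//   buffers[2] = value offsets   // dense mode only
buffers := []*memory.Buffer{nil, memory.NewBufferBytes([]byte{})}
if dt.Mode() == arrow.DenseMode {
	buffers = append(buffers, memory.NewBufferBytes([]byte{}))
}
```

A clarifying comment on the `append` would help either way.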
##########
go/arrow/internal/arrjson/arrjson_test.go:
##########
@@ -5085,3 +5086,352 @@ func makeExtensionsWantJSONs() string {
]
}`
}
+
+func makeUnionWantJSONs() string {
Review Comment:
Perhaps make some of the child entries null? VALIDITY seems to be always 1 below.
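For example, one child column could carry a 0 in its VALIDITY array, along the lines of (a hand-written fragment in the integration-JSON style, not generated output):

```json
{
  "name": "u0",
  "count": 7,
  "VALIDITY": [1, 0, 1, 1, 1, 1, 1],
  "DATA": [0, 0, 2, 3, 4, 5, 6]
}
```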
##########
go/arrow/ipc/writer.go:
##########
@@ -724,6 +796,34 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffe
return voffsets, nil
}
+func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer {
+ // this case sucks. Because the offsets are different for each
+ // child array, when we have a sliced array, we need to re-base
+ // the value offsets for each array! ew.
+ unshiftedOffsets := arr.RawValueOffsets()
+ codes := arr.RawTypeCodes()
+
+ shiftedOffsetsBuf := memory.NewResizableBuffer(w.mem)
+ shiftedOffsetsBuf.Resize(arrow.Int32Traits.BytesRequired(arr.Len()))
+ shiftedOffsets := arrow.Int32Traits.CastFromBytes(shiftedOffsetsBuf.Bytes())
+
+ // offsets may not be ascending, so we need to find out the start offset for each child
Review Comment:
I see you basically copied the C++ implementation. I'll open a JIRA to simplify it.
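Given the spec guarantee quoted above (per-child offsets are increasing), the rebase could presumably collapse to a single pass: the first offset seen for each type code is that child's minimum within the slice, and everything after it just subtracts that base. A sketch of that simplification (hypothetical helper, not the PR's code):

```go
// hypothetical single-pass rebase, relying on per-child offsets being increasing
func rebaseOffsets(codes []arrow.UnionTypeCode, unshifted, shifted []int32) {
	var base [128]int32 // one slot per possible type code (0..127)
	var seen [128]bool
	for i, c := range codes {
		if !seen[c] {
			// first occurrence of this child: its smallest offset in the slice
			base[c], seen[c] = unshifted[i], true
		}
		shifted[i] = unshifted[i] - base[c]
	}
}
```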