pitrou commented on code in PR #13806:
URL: https://github.com/apache/arrow/pull/13806#discussion_r940165319
##########
go/arrow/internal/arrdata/arrdata.go:
##########
@@ -935,6 +935,64 @@ func makeExtensionRecords() []arrow.Record {
return recs
}
+func makeUnionRecords() []arrow.Record {
+ mem := memory.NewGoAllocator()
+
+ unionFields := []arrow.Field{
+ {Name: "u0", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
+ {Name: "u1", Type: arrow.PrimitiveTypes.Uint8, Nullable: true},
+ }
+
+ typeCodes := []arrow.UnionTypeCode{5, 10}
+ sparseType := arrow.SparseUnionOf(unionFields, typeCodes)
+ denseType := arrow.DenseUnionOf(unionFields, typeCodes)
+
+ schema := arrow.NewSchema([]arrow.Field{
+ {Name: "sparse", Type: sparseType, Nullable: true},
+ {Name: "dense", Type: denseType, Nullable: true},
+ }, nil)
+
+ sparseChildren := make([]arrow.Array, 4)
+ denseChildren := make([]arrow.Array, 4)
+
+ const length = 7
+
+ typeIDsBuffer := memory.NewBufferBytes(arrow.Uint8Traits.CastToBytes([]uint8{5, 10, 5, 5, 10, 10, 5}))
+ sparseChildren[0] = arrayOf(mem, []int32{0, 1, 2, 3, 4, 5, 6}, nil)
+ defer sparseChildren[0].Release()
+ sparseChildren[1] = arrayOf(mem, []uint8{10, 11, 12, 13, 14, 15, 16}, nil)
+ defer sparseChildren[1].Release()
+ sparseChildren[2] = arrayOf(mem, []int32{0, -1, -2, -3, -4, -5, -6}, nil)
+ defer sparseChildren[2].Release()
+ sparseChildren[3] = arrayOf(mem, []uint8{100, 101, 102, 103, 104, 105, 106}, nil)
+ defer sparseChildren[3].Release()
+
+ denseChildren[0] = arrayOf(mem, []int32{0, 2, 3, 7}, nil)
+ defer denseChildren[0].Release()
+ denseChildren[1] = arrayOf(mem, []uint8{11, 14, 15}, nil)
+ defer denseChildren[1].Release()
+ denseChildren[2] = arrayOf(mem, []int32{0, -2, -3, -7}, nil)
+ defer denseChildren[2].Release()
+ denseChildren[3] = arrayOf(mem, []uint8{101, 104, 105}, nil)
+ defer denseChildren[3].Release()
Review Comment:
Should you add some nulls in at least some of these child arrays?
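For reference, a minimal sketch of what that could look like, assuming the `arrayOf` helper accepts a validity mask as its third argument (as it is used elsewhere in arrdata.go):

```go
// hypothetical: mark positions 1 and 4 of one sparse child as null
mask := []bool{true, false, true, true, false, true, true}
sparseChildren[0] = arrayOf(mem, []int32{0, 1, 2, 3, 4, 5, 6}, mask)
defer sparseChildren[0].Release()
```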
##########
go/arrow/internal/arrjson/arrjson.go:
##########
@@ -1152,6 +1176,31 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr
defer indices.Release()
return array.NewData(dt, indices.Len(), indices.Buffers(), indices.Children(), indices.NullN(), indices.Offset())
+ case arrow.UnionType:
+ fields := make([]arrow.ArrayData, len(dt.Fields()))
+ for i, f := range dt.Fields() {
+ child := arrayFromJSON(mem, f.Type, arr.Children[i])
+ defer child.Release()
+ fields[i] = child
+ }
+
+ typeIdBuf := memory.NewBufferBytes(arrow.Int8Traits.CastToBytes(arr.TypeID))
+ defer typeIdBuf.Release()
+ buffers := []*memory.Buffer{nil, typeIdBuf}
+ if dt.Mode() == arrow.DenseMode {
+ var offsets []byte
+ if arr.Offset == nil {
+ offsets = []byte{}
Review Comment:
Does that actually happen?
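If it can, it would be because encoding/json leaves a slice field nil when the key is absent from the input. A standalone sketch of that behavior (with a hypothetical stand-in for the arrjson struct):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// hypothetical stand-in for the arrjson Array field
type rec struct {
	Offset []int32 `json:"OFFSET"`
}

func main() {
	var r rec
	// no OFFSET key in the input, so r.Offset stays nil
	if err := json.Unmarshal([]byte(`{}`), &r); err != nil {
		panic(err)
	}
	fmt.Println(r.Offset == nil) // prints: true
}
```

So the guard would only be reachable for a dense union entry written without an OFFSET key, if such files exist.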
##########
go/arrow/ipc/writer.go:
##########
@@ -478,23 +479,25 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error {
return nil
}
- switch arr.NullN() {
- case 0:
- // there are no null values, drop the null bitmap
- p.body = append(p.body, nil)
- default:
- data := arr.Data()
- var bitmap *memory.Buffer
- if data.NullN() == data.Len() {
- // every value is null, just use a new unset bitmap to avoid the expense of copying
- bitmap = memory.NewResizableBuffer(w.mem)
- minLength := paddedLength(bitutil.BytesForBits(int64(data.Len())), kArrowAlignment)
- bitmap.Resize(int(minLength))
- } else {
- // otherwise truncate and copy the bits
- bitmap = newTruncatedBitmap(w.mem, int64(data.Offset()), int64(data.Len()), data.Buffers()[0])
+ if hasValidityBitmap(arr.DataType().ID(), currentMetadataVersion) {
+ switch arr.NullN() {
+ case 0:
+ // there are no null values, drop the null bitmap
+ p.body = append(p.body, nil)
+ default:
+ data := arr.Data()
+ var bitmap *memory.Buffer
+ if data.NullN() == data.Len() {
+ // every value is null, just use a new unset bitmap to avoid the expense of copying
Review Comment:
By "unset", do you mean zero-initialized?
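Assuming yes: the Go allocator hands back zeroed memory, so a freshly resized buffer is an all-zeros bitmap, i.e. every slot reads as null. A quick sketch of that assumption:

```go
// assumption: a new resizable buffer from the Go allocator is zero-filled,
// so every validity bit is 0 (null) without copying anything
bitmap := memory.NewResizableBuffer(memory.NewGoAllocator())
bitmap.Resize(8)
for _, b := range bitmap.Bytes() {
	if b != 0 {
		panic("not zero-initialized")
	}
}
```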
##########
go/arrow/ipc/writer.go:
##########
@@ -724,6 +796,34 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffe
return voffsets, nil
}
+func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer {
+ // this case sucks. Because the offsets are different for each
+ // child array, when we have a sliced array, we need to re-base
+ // the value offsets for each array! ew.
+ unshiftedOffsets := arr.RawValueOffsets()
+ codes := arr.RawTypeCodes()
+
+ shiftedOffsetsBuf := memory.NewResizableBuffer(w.mem)
+ shiftedOffsetsBuf.Resize(arrow.Int32Traits.BytesRequired(arr.Len()))
+ shiftedOffsets := arrow.Int32Traits.CastFromBytes(shiftedOffsetsBuf.Bytes())
+
+ // offsets may not be ascending, so we need to find out the start offset for each child
Review Comment:
I filed ARROW-17339.
##########
go/arrow/ipc/writer.go:
##########
@@ -724,6 +796,34 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffe
return voffsets, nil
}
+func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer {
+ // this case sucks. Because the offsets are different for each
+ // child array, when we have a sliced array, we need to re-base
+ // the value offsets for each array! ew.
+ unshiftedOffsets := arr.RawValueOffsets()
+ codes := arr.RawTypeCodes()
+
+ shiftedOffsetsBuf := memory.NewResizableBuffer(w.mem)
+ shiftedOffsetsBuf.Resize(arrow.Int32Traits.BytesRequired(arr.Len()))
+ shiftedOffsets := arrow.Int32Traits.CastFromBytes(shiftedOffsetsBuf.Bytes())
+
+ // offsets may not be ascending, so we need to find out the start offset for each child
Review Comment:
The [spec](https://arrow.apache.org/docs/format/Columnar.html#dense-union) says:
> The respective offsets for each child value array must be in order / increasing.
##########
go/arrow/ipc/file_reader.go:
##########
@@ -617,6 +626,47 @@ func (ctx *arrayLoaderContext) loadStruct(dt *arrow.StructType) arrow.ArrayData
return array.NewData(dt, int(field.Length()), buffers, subs, int(field.NullCount()), 0)
}
+func (ctx *arrayLoaderContext) loadUnion(dt arrow.UnionType) arrow.ArrayData {
+ // Sparse unions have 2 buffers (a nil validity bitmap, and the type ids)
+ nBuffers := 2
+ // Dense unions have a third buffer, the offsets
+ if dt.Mode() == arrow.DenseMode {
+ nBuffers = 3
+ }
+
+ field, buffers := ctx.loadCommon(dt.ID(), nBuffers)
+ if field.NullCount() != 0 && buffers[0] != nil {
+ panic("arrow/ipc: cannot read pre-1.0.0 union array with top-level validity bitmap")
+ }
+
+ switch field.Length() {
+ case 0:
+ buffers = append(buffers, memory.NewBufferBytes([]byte{}))
Review Comment:
Is this the type codes buffer?
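If so, the layout being assembled would presumably be (a sketch of the assumption, based on the comments at the top of `loadUnion`, not the PR's exact code):

```go
// assumed union buffer layout:
//   buffers[0] = nil             // no top-level validity bitmap post-1.0
//   buffers[1] = type codes      // empty here, since the field length is 0
//   buffers[2] = value offsets   // dense mode only
buffers := []*memory.Buffer{nil, memory.NewBufferBytes([]byte{})}
if dt.Mode() == arrow.DenseMode {
	buffers = append(buffers, memory.NewBufferBytes([]byte{}))
}
```

A clarifying comment on the `append` would help either way.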
##########
go/arrow/internal/arrjson/arrjson_test.go:
##########
@@ -5085,3 +5086,352 @@ func makeExtensionsWantJSONs() string {
]
}`
}
+
+func makeUnionWantJSONs() string {
Review Comment:
Perhaps make some of the child entries null? VALIDITY seems to be always 1 below.
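For example, one child column could carry a 0 in its VALIDITY array, along the lines of (a hand-written fragment in the integration-JSON style, not generated output):

```json
{
  "name": "u0",
  "count": 7,
  "VALIDITY": [1, 0, 1, 1, 1, 1, 1],
  "DATA": [0, 0, 2, 3, 4, 5, 6]
}
```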
##########
go/arrow/ipc/writer.go:
##########
@@ -724,6 +796,34 @@ func (w *recordEncoder) getZeroBasedValueOffsets(arr arrow.Array) (*memory.Buffe
return voffsets, nil
}
+func (w *recordEncoder) rebaseDenseUnionValueOffsets(arr *array.DenseUnion, offsets, lengths []int32) *memory.Buffer {
+ // this case sucks. Because the offsets are different for each
+ // child array, when we have a sliced array, we need to re-base
+ // the value offsets for each array! ew.
+ unshiftedOffsets := arr.RawValueOffsets()
+ codes := arr.RawTypeCodes()
+
+ shiftedOffsetsBuf := memory.NewResizableBuffer(w.mem)
+ shiftedOffsetsBuf.Resize(arrow.Int32Traits.BytesRequired(arr.Len()))
+ shiftedOffsets := arrow.Int32Traits.CastFromBytes(shiftedOffsetsBuf.Bytes())
+
+ // offsets may not be ascending, so we need to find out the start offset for each child
Review Comment:
I see you basically copied the C++ implementation. I'll open a JIRA to simplify it.
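Given the spec guarantee quoted above (per-child offsets are increasing), the rebase could presumably collapse to a single pass: the first offset seen for each type code is that child's minimum within the slice, and everything after it just subtracts that base. A sketch of that simplification (hypothetical helper, not the PR's code):

```go
// hypothetical single-pass rebase, relying on per-child offsets being increasing
func rebaseOffsets(codes []arrow.UnionTypeCode, unshifted, shifted []int32) {
	var base [128]int32 // one slot per possible type code (0..127)
	var seen [128]bool
	for i, c := range codes {
		if !seen[c] {
			// first occurrence of this child: its smallest offset in the slice
			base[c], seen[c] = unshifted[i], true
		}
		shifted[i] = unshifted[i] - base[c]
	}
}
```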