zeroshade commented on code in PR #13768:
URL: https://github.com/apache/arrow/pull/13768#discussion_r938173018
##########
go/arrow/datatype_nested.go:
##########
@@ -384,6 +388,286 @@ func (t *MapType) Layout() DataTypeLayout {
func (MapType) OffsetTypeTraits() OffsetTraits { return Int32Traits }
+type (
+ // UnionTypeCode is an alias to int8 which is the type of the ids
+ // used for union arrays.
+ UnionTypeCode = int8
+ UnionMode int8
+)
+
+const (
+ MaxUnionTypeCode UnionTypeCode = 127
+ InvalidUnionChildID int = -1
+
+ SparseMode UnionMode = iota
+ DenseMode
+)
+
+// UnionType is an interface to encompass both Dense and Sparse Union types.
+//
+// A UnionType is a nested type where each logical value is taken
+// from a single child. A buffer of 8-bit type ids (typed as UnionTypeCode)
+// indicates which child a given logical value is to be taken from. This is
+// represented as the "child id" or "child index", which is the index into the
+// list of child fields for a given child.
+type UnionType interface {
+ NestedType
+ // Mode returns either SparseMode or DenseMode depending on the current
+ // concrete data type.
+ Mode() UnionMode
+ // ChildIDs returns a slice of ints to map UnionTypeCode values to
+ // the index in the Fields that represents the given Type. It is
+ // initialized with all values being InvalidUnionChildID (-1)
+ // before being populated based on the TypeCodes and fields of the type.
+ // The field for a given type can be retrieved by
Fields()[ChildIDs()[typeCode]]
+ ChildIDs() []int
+ // TypeCodes returns the list of available type codes for this union
type
+ // which will correspond to indexes into the ChildIDs slice to locate
the
+ // appropriate child. A union Array contains a buffer of these type
codes
+ // which indicate for a given index, which child has the value for that
index.
+ TypeCodes() []UnionTypeCode
+ // MaxTypeCode returns the value of the largest TypeCode in the list of
typecodes
+ // that are defined by this Union type
+ MaxTypeCode() UnionTypeCode
+}
+
+// UnionOf returns an appropriate union type for the given Mode (Sparse or
Dense),
+// child fields, and type codes. len(fields) == len(typeCodes) must be true,
or else
+// this will panic. len(fields) can be 0.
+func UnionOf(mode UnionMode, fields []Field, typeCodes []UnionTypeCode)
UnionType {
+ switch mode {
+ case SparseMode:
+ return SparseUnionOf(fields, typeCodes)
+ case DenseMode:
+ return DenseUnionOf(fields, typeCodes)
+ default:
+ panic("arrow: invalid union mode")
+ }
+}
+
+type unionType struct {
+ children []Field
+ typeCodes []UnionTypeCode
+ childIDs [int(MaxUnionTypeCode) + 1]int
+}
+
+func (t *unionType) init(fields []Field, typeCodes []UnionTypeCode) {
+ // initialize all child IDs to -1
+ t.childIDs[0] = InvalidUnionChildID
+ for i := 1; i < len(t.childIDs); i *= 2 {
+ copy(t.childIDs[i:], t.childIDs[:i])
+ }
+
+ t.children = fields
+ t.typeCodes = typeCodes
+
+ for i, tc := range t.typeCodes {
+ t.childIDs[tc] = i
+ }
+}
+
+func (t unionType) Fields() []Field { return t.children }
+func (t unionType) TypeCodes() []UnionTypeCode { return t.typeCodes }
+func (t unionType) ChildIDs() []int { return t.childIDs[:] }
+
+func (t *unionType) validate(fields []Field, typeCodes []UnionTypeCode, _
UnionMode) error {
+ if len(fields) != len(typeCodes) {
+ return errors.New("arrow: union types should have the same
number of fields as type codes")
+ }
+
+ for _, c := range typeCodes {
+ if c < 0 || c > MaxUnionTypeCode {
+ return errors.New("arrow: union type code out of
bounds")
+ }
+ }
+ return nil
+}
+
+func (t *unionType) MaxTypeCode() (max UnionTypeCode) {
+ if len(t.typeCodes) == 0 {
+ return
+ }
+
+ max = t.typeCodes[0]
+ for _, c := range t.typeCodes[1:] {
+ if c > max {
+ max = c
+ }
+ }
+ return
+}
+
+func (t *unionType) String() string {
+ var b strings.Builder
+ b.WriteByte('<')
+ for i := range t.typeCodes {
+ if i != 0 {
+ b.WriteString(", ")
+ }
+ fmt.Fprintf(&b, "%s=%d", t.children[i], t.typeCodes[i])
+ }
+ b.WriteByte('>')
+ return b.String()
+}
+
+func (t *unionType) fingerprint() string {
+ var b strings.Builder
+ for _, c := range t.typeCodes {
+ fmt.Fprintf(&b, ":%d", c)
+ }
+ b.WriteString("]{")
+ for _, c := range t.children {
+ fingerprint := c.Fingerprint()
+ if len(fingerprint) == 0 {
+ return ""
+ }
+ b.WriteString(fingerprint)
+ b.WriteByte(';')
+ }
+ b.WriteByte('}')
+ return b.String()
+}
+
+func fieldsFromArrays(arrays []Array, names ...string) (ret []Field) {
+ ret = make([]Field, len(arrays))
+ if len(names) == 0 {
+ for i, c := range arrays {
+ ret[i] = Field{Name: strconv.Itoa(i), Type:
c.DataType(), Nullable: true}
+ }
+ } else {
+ debug.Assert(len(names) == len(arrays), "mismatch of arrays and
names")
+ for i, c := range arrays {
+ ret[i] = Field{Name: names[i], Type: c.DataType(),
Nullable: true}
+ }
+ }
+ return
+}
+
+// SparseUnionType is the concrete type for Sparse union data.
+//
+// A sparse union is a nested type where each logical value is taken
+// from a single child. A buffer of 8-bit type ids indicates which child
+// a given logical value is to be taken from.
+//
+// In a sparse union, each child array will have the same length as the
+// union array itself, regardless of the actual number of union values which
+// refer to it.
+//
+// Unlike most other types, unions do not have a top-level validity bitmap.
+type SparseUnionType struct {
+ unionType
+}
+
+// SparseUnionFromArrays enables creating a union type from a list of Arrays,
+// field names, and type codes. len(fields) should be either 0 or equal to
len(children).
+// len(codes) should also be either 0, or equal to len(children).
+//
+// If len(fields) == 0, then the fields will be named numerically as "0", "1",
"2"...
+// and so on. If len(codes) == 0, then the type codes will be constructed as
+// [0, 1, 2, ..., n].
+func SparseUnionFromArrays(children []Array, fields []string, codes
[]UnionTypeCode) *SparseUnionType {
+ if len(codes) == 0 {
+ codes = make([]UnionTypeCode, len(children))
+ for i := range children {
+ codes[i] = UnionTypeCode(i)
+ }
+ }
+ return SparseUnionOf(fieldsFromArrays(children, fields...), codes)
+}
+
+// SparseUnionOf is equivalent to UnionOf(arrow.SparseMode, fields, typeCodes),
+// constructing a SparseUnionType from a list of fields and type codes.
+//
+// If len(fields) != len(typeCodes) this will panic. They are allowed to be
+// of length 0.
+func SparseUnionOf(fields []Field, typeCodes []UnionTypeCode) *SparseUnionType
{
+ ret := &SparseUnionType{}
Review Comment:
so, it would actually be more consistent to swap out the usages of `new`
since instantiating using `&Type{}` is the consistent style in the existing
codebase. (look elsewhere at the other `*Of(...)` functions or any of the
`New*Data` functions in the array package). I just didn't change the usages of
`new(strings.Builder)` since I wasn't modifying the existing code. I'll swap
them out here in order to make that more consistent.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]