zeroshade commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1237592099
##########
go/arrow/array/binarybuilder.go:
##########
@@ -358,6 +359,301 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
return b.Unmarshal(dec)
}
+const (
+ dfltBlockSize = 1 << 20 // 1 MB
+ viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+ builder
+ dtype arrow.BinaryDataType
+
+ data *memory.Buffer
+ rawData []arrow.StringHeader
+
+ blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+ return &BinaryViewBuilder{
+ dtype: arrow.BinaryTypes.BinaryView,
+ builder: builder{
+ refCount: 1,
+ mem: mem,
+ },
+ blockBuilder: multiBufferBuilder{
+ refCount: 1,
+ blockSize: dfltBlockSize,
+ mem: mem,
+ },
+ }
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+ debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+ if atomic.AddInt64(&b.refCount, -1) == 0 {
+ if b.nullBitmap != nil {
+ b.nullBitmap.Release()
+ b.nullBitmap = nil
+ }
+ if b.data != nil {
+ b.data.Release()
+ b.data = nil
+ b.rawData = nil
+ }
+ }
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+ b.builder.init(capacity)
+ b.data = memory.NewResizableBuffer(b.mem)
+ bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+ b.data.Resize(bytesN)
+ b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+ nbuild := n
+ if n < minBuilderCapacity {
+ n = minBuilderCapacity
+ }
+
+ if b.capacity == 0 {
+ b.init(n)
+ } else {
+ b.builder.resize(nbuild, b.init)
+ b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+ b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+ }
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+ if uint32(length) > viewValueSizeLimit {
+ panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB",
+ arrow.ErrInvalid))
+ }
+ b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+ b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+ if uint32(len(v)) > viewValueSizeLimit {
+ panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid))
+ }
+
+ if !arrow.IsStringHeaderInline(len(v)) {
+ b.ReserveData(len(v))
+ }
+
+ b.Reserve(1)
+ b.UnsafeAppend(v)
+}
+
+func (b *BinaryViewBuilder) AppendString(v string) {
+ // create a []byte without copying the bytes
+ // in go1.20 this would be unsafe.StringData
+ val := *(*[]byte)(unsafe.Pointer(&struct {
+ string
+ int
+ }{v, len(v)}))
+ b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+ b.Reserve(1)
+ b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+ b.Reserve(1)
+ b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+ hdr := &b.rawData[b.length]
+ hdr.SetBytes(v)
+ if !hdr.IsInline() {
+ b.blockBuilder.UnsafeAppend(hdr, v)
+ }
+ b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+ if len(v) != len(valid) && len(valid) != 0 {
+ panic("len(v) != len(valid) && len(valid) != 0")
+ }
+
+ if len(v) == 0 {
+ return
+ }
+
+ b.Reserve(len(v))
+ outOfLineTotal := 0
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ if !arrow.IsStringHeaderInline(len(vv)) {
+ outOfLineTotal += len(vv)
+ }
+ }
+ }
+
+ b.ReserveData(outOfLineTotal)
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ hdr := &b.rawData[b.length+i]
+ hdr.SetBytes(vv)
+ if !hdr.IsInline() {
+ b.blockBuilder.UnsafeAppend(hdr, vv)
+ }
+ }
+ }
+
+ b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+ if len(v) != len(valid) && len(valid) != 0 {
+ panic("len(v) != len(valid) && len(valid) != 0")
+ }
+
+ if len(v) == 0 {
+ return
+ }
+
+ b.Reserve(len(v))
+ outOfLineTotal := 0
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ if !arrow.IsStringHeaderInline(len(vv)) {
+ outOfLineTotal += len(vv)
+ }
+ }
+ }
+
+ b.ReserveData(outOfLineTotal)
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ hdr := &b.rawData[b.length+i]
+ hdr.SetString(vv)
+ if !hdr.IsInline() {
+ b.blockBuilder.UnsafeAppendString(hdr, vv)
+ }
+ }
+ }
+
+ b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {
Review Comment:
done, let me know if you think it is sufficient or needs some more
information
##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool {
return true
}
+type ViewLike interface {
+ arrow.Array
+ ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+ array
+ values []arrow.StringHeader
+ dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+ a := &BinaryView{}
+ a.refCount = 1
+ a.setData(data.(*Data))
+ return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+ if len(data.buffers) < 2 {
+ panic("len(data.buffers) < 2")
+ }
+ a.array.setData(data)
+
+ if valueData := data.buffers[1]; valueData != nil {
+ a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+ }
+
+ a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+ if i < 0 || i >= a.array.data.length {
+ panic("arrow/array: index out of range")
+ }
+ return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+ s := a.ValueHeader(i)
+ if s.IsInline() {
+ return s.InlineBytes()
+ }
+ start := s.BufferOffset()
+ buf := a.dataBuffers[s.BufferIndex()]
+ return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+ b := a.Value(i)
+ return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+ var o strings.Builder
+ o.WriteString("[")
+ for i := 0; i < a.Len(); i++ {
+ if i > 0 {
+ o.WriteString(" ")
+ }
+ switch {
+ case a.IsNull(i):
+ o.WriteString(NullValueStr)
+ default:
+ fmt.Fprintf(&o, "%q", a.ValueString(i))
+ }
+ }
+ o.WriteString("]")
+ return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {
Review Comment:
done, let me know if you think it is sufficient or needs some more
information
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]