lquerel commented on code in PR #35769:
URL: https://github.com/apache/arrow/pull/35769#discussion_r1235818429
##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool
{
return true
}
+type ViewLike interface {
+ arrow.Array
+ ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+ array
+ values []arrow.StringHeader
+ dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+ a := &BinaryView{}
+ a.refCount = 1
+ a.setData(data.(*Data))
+ return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+ if len(data.buffers) < 2 {
+ panic("len(data.buffers) < 2")
+ }
+ a.array.setData(data)
+
+ if valueData := data.buffers[1]; valueData != nil {
+ a.values =
arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+ }
+
+ a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+ if i < 0 || i >= a.array.data.length {
+ panic("arrow/array: index out of range")
+ }
+ return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
Review Comment:
Since there is no compiler guarantee in Go that the slice returned by this
method will remain unchanged (whether intentionally or unintentionally) by
users of this API, I would suggest adding a comment to this method. This
comment should clearly specify that it's unsafe to make any kind of changes to
the returned slice.
##########
go/arrow/array/binary.go:
##########
@@ -318,6 +319,116 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool
{
return true
}
+type ViewLike interface {
+ arrow.Array
+ ValueHeader(int) *arrow.StringHeader
+}
+
+type BinaryView struct {
+ array
+ values []arrow.StringHeader
+ dataBuffers []*memory.Buffer
+}
+
+func NewBinaryViewData(data arrow.ArrayData) *BinaryView {
+ a := &BinaryView{}
+ a.refCount = 1
+ a.setData(data.(*Data))
+ return a
+}
+
+func (a *BinaryView) setData(data *Data) {
+ if len(data.buffers) < 2 {
+ panic("len(data.buffers) < 2")
+ }
+ a.array.setData(data)
+
+ if valueData := data.buffers[1]; valueData != nil {
+ a.values =
arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes())
+ }
+
+ a.dataBuffers = data.buffers[2:]
+}
+
+func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader {
+ if i < 0 || i >= a.array.data.length {
+ panic("arrow/array: index out of range")
+ }
+ return &a.values[a.array.data.offset+i]
+}
+
+func (a *BinaryView) Value(i int) []byte {
+ s := a.ValueHeader(i)
+ if s.IsInline() {
+ return s.InlineBytes()
+ }
+ start := s.BufferOffset()
+ buf := a.dataBuffers[s.BufferIndex()]
+ return buf.Bytes()[start : start+uint32(s.Len())]
+}
+
+func (a *BinaryView) ValueString(i int) string {
+ b := a.Value(i)
+ return *(*string)(unsafe.Pointer(&b))
+}
+
+func (a *BinaryView) String() string {
+ var o strings.Builder
+ o.WriteString("[")
+ for i := 0; i < a.Len(); i++ {
+ if i > 0 {
+ o.WriteString(" ")
+ }
+ switch {
+ case a.IsNull(i):
+ o.WriteString(NullValueStr)
+ default:
+ fmt.Fprintf(&o, "%q", a.ValueString(i))
+ }
+ }
+ o.WriteString("]")
+ return o.String()
+}
+
+func (a *BinaryView) ValueStr(i int) string {
Review Comment:
It would likely help users to have a comment describing the differences
between this method and `ValueString`.
##########
go/arrow/array/binarybuilder.go:
##########
@@ -358,6 +359,301 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error {
return b.Unmarshal(dec)
}
+const (
+ dfltBlockSize = 1 << 20 // 1 MB
+ viewValueSizeLimit uint32 = math.MaxUint32
+)
+
+type BinaryViewBuilder struct {
+ builder
+ dtype arrow.BinaryDataType
+
+ data *memory.Buffer
+ rawData []arrow.StringHeader
+
+ blockBuilder multiBufferBuilder
+}
+
+func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder {
+ return &BinaryViewBuilder{
+ dtype: arrow.BinaryTypes.BinaryView,
+ builder: builder{
+ refCount: 1,
+ mem: mem,
+ },
+ blockBuilder: multiBufferBuilder{
+ refCount: 1,
+ blockSize: dfltBlockSize,
+ mem: mem,
+ },
+ }
+}
+
+func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype }
+
+func (b *BinaryViewBuilder) Release() {
+ debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases")
+
+ if atomic.AddInt64(&b.refCount, -1) == 0 {
+ if b.nullBitmap != nil {
+ b.nullBitmap.Release()
+ b.nullBitmap = nil
+ }
+ if b.data != nil {
+ b.data.Release()
+ b.data = nil
+ b.rawData = nil
+ }
+ }
+}
+
+func (b *BinaryViewBuilder) init(capacity int) {
+ b.builder.init(capacity)
+ b.data = memory.NewResizableBuffer(b.mem)
+ bytesN := arrow.StringHeaderTraits.BytesRequired(capacity)
+ b.data.Resize(bytesN)
+ b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+}
+
+func (b *BinaryViewBuilder) Resize(n int) {
+ nbuild := n
+ if n < minBuilderCapacity {
+ n = minBuilderCapacity
+ }
+
+ if b.capacity == 0 {
+ b.init(n)
+ } else {
+ b.builder.resize(nbuild, b.init)
+ b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n))
+ b.rawData =
arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes())
+ }
+}
+
+func (b *BinaryViewBuilder) ReserveData(length int) {
+ if uint32(length) > viewValueSizeLimit {
+ panic(fmt.Errorf("%w: BinaryView or StringView elements cannot
reference strings larger than 4GB",
+ arrow.ErrInvalid))
+ }
+ b.blockBuilder.Reserve(int(length))
+}
+
+func (b *BinaryViewBuilder) Reserve(n int) {
+ b.builder.reserve(n, b.Resize)
+}
+
+func (b *BinaryViewBuilder) Append(v []byte) {
+ if uint32(len(v)) > viewValueSizeLimit {
+ panic(fmt.Errorf("%w: BinaryView or StringView elements cannot
reference strings larger than 4GB", arrow.ErrInvalid))
+ }
+
+ if !arrow.IsStringHeaderInline(len(v)) {
+ b.ReserveData(len(v))
+ }
+
+ b.Reserve(1)
+ b.UnsafeAppend(v)
+}
+
+func (b *BinaryViewBuilder) AppendString(v string) {
+ // create a []byte without copying the bytes
+ // in go1.20 this would be unsafe.StringData
+ val := *(*[]byte)(unsafe.Pointer(&struct {
+ string
+ int
+ }{v, len(v)}))
+ b.Append(val)
+}
+
+func (b *BinaryViewBuilder) AppendNull() {
+ b.Reserve(1)
+ b.UnsafeAppendBoolToBitmap(false)
+}
+
+func (b *BinaryViewBuilder) AppendEmptyValue() {
+ b.Reserve(1)
+ b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) UnsafeAppend(v []byte) {
+ hdr := &b.rawData[b.length]
+ hdr.SetBytes(v)
+ if !hdr.IsInline() {
+ b.blockBuilder.UnsafeAppend(hdr, v)
+ }
+ b.UnsafeAppendBoolToBitmap(true)
+}
+
+func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) {
+ if len(v) != len(valid) && len(valid) != 0 {
+ panic("len(v) != len(valid) && len(valid) != 0")
+ }
+
+ if len(v) == 0 {
+ return
+ }
+
+ b.Reserve(len(v))
+ outOfLineTotal := 0
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ if !arrow.IsStringHeaderInline(len(vv)) {
+ outOfLineTotal += len(vv)
+ }
+ }
+ }
+
+ b.ReserveData(outOfLineTotal)
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ hdr := &b.rawData[b.length+i]
+ hdr.SetBytes(vv)
+ if !hdr.IsInline() {
+ b.blockBuilder.UnsafeAppend(hdr, vv)
+ }
+ }
+ }
+
+ b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) {
+ if len(v) != len(valid) && len(valid) != 0 {
+ panic("len(v) != len(valid) && len(valid) != 0")
+ }
+
+ if len(v) == 0 {
+ return
+ }
+
+ b.Reserve(len(v))
+ outOfLineTotal := 0
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ if !arrow.IsStringHeaderInline(len(vv)) {
+ outOfLineTotal += len(vv)
+ }
+ }
+ }
+
+ b.ReserveData(outOfLineTotal)
+ for i, vv := range v {
+ if len(valid) == 0 || valid[i] {
+ hdr := &b.rawData[b.length+i]
+ hdr.SetString(vv)
+ if !hdr.IsInline() {
+ b.blockBuilder.UnsafeAppendString(hdr, vv)
+ }
+ }
+ }
+
+ b.builder.unsafeAppendBoolsToBitmap(valid, len(v))
+}
+
+func (b *BinaryViewBuilder) AppendValueFromString(s string) error {
Review Comment:
It would likely help users to have a comment describing the differences
between this method and `AppendString`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]