This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new c03dad8f perf(arrow): Reduce the amount of allocated objects (#645)
c03dad8f is described below
commit c03dad8f3c87ae72cc9a356fc573ebdae5d1a622
Author: Stas Spiridonov <[email protected]>
AuthorDate: Tue Jan 27 15:37:41 2026 -0500
perf(arrow): Reduce the amount of allocated objects (#645)
### Rationale for this change
`apache/arrow-go` is used in the new query engine in Grafana Loki and we
are actively working on improving its performance. Here are a few
low-hanging fruits that reduce allocations by many gigabytes when used on
the hot path.
### What changes are included in this PR?
This PR reduces the number of allocated objects.
* `arrow/datatype_binary.go`: straightforward; those `Layout` objects
are currently allocated on every call.
* `arrow/schema.go`: `Schema.Fields()` does not return a clone anymore
and `NewSchemaWithEndian()` does not clone inputs anymore. All call
sites are func-signature compatible, but should not modify those
args/results anymore. In our opinion this is a reasonable trade-off for
higher performance. I am happy to discuss other ways to implement that.
### Are these changes tested?
* Tested with the current `apache/arrow-go` unit test suite.
* Tested with `grafana/loki` test suite for the query engine.
### Are there any user-facing changes?
Yes. Explained above.
---
arrow/datatype_binary.go | 130 ++++++++++++++++++++++++-----------------------
arrow/schema.go | 50 +++++++++++++-----
2 files changed, 103 insertions(+), 77 deletions(-)
diff --git a/arrow/datatype_binary.go b/arrow/datatype_binary.go
index f3e601f0..f6b6b322 100644
--- a/arrow/datatype_binary.go
+++ b/arrow/datatype_binary.go
@@ -27,95 +27,97 @@ type OffsetTraits interface {
BytesRequired(int) int
}
+var (
+ binaryTypeLayout = DataTypeLayout{
+ Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()},
+ }
+ stringTypeLayout = DataTypeLayout{
+ Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()},
+ }
+ largeBinaryTypeLayout = DataTypeLayout{
+ Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()},
+ }
+ largeStringTypeLayout = DataTypeLayout{
+ Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()},
+ }
+
+ variadic = SpecVariableWidth()
+
+ binaryViewTypeLayout = DataTypeLayout{
+ Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(ViewHeaderSizeBytes)},
+ VariadicSpec: &variadic,
+ }
+ stringViewTypeLayout = DataTypeLayout{
+ Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(ViewHeaderSizeBytes)},
+ VariadicSpec: &variadic,
+ }
+)
+
type BinaryType struct{}
-func (t *BinaryType) ID() Type { return BINARY }
-func (t *BinaryType) Name() string { return "binary" }
-func (t *BinaryType) String() string { return "binary" }
-func (t *BinaryType) binary() {}
-func (t *BinaryType) Fingerprint() string { return typeFingerprint(t) }
-func (t *BinaryType) Layout() DataTypeLayout {
- return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
- SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()}}
-}
+func (t *BinaryType) ID() Type { return BINARY }
+func (t *BinaryType) Name() string { return "binary" }
+func (t *BinaryType) String() string { return "binary" }
+func (t *BinaryType) binary() {}
+func (t *BinaryType) Fingerprint() string { return
typeFingerprint(t) }
+func (t *BinaryType) Layout() DataTypeLayout { return binaryTypeLayout
}
func (t *BinaryType) OffsetTypeTraits() OffsetTraits { return Int32Traits }
func (BinaryType) IsUtf8() bool { return false }
type StringType struct{}
-func (t *StringType) ID() Type { return STRING }
-func (t *StringType) Name() string { return "utf8" }
-func (t *StringType) String() string { return "utf8" }
-func (t *StringType) binary() {}
-func (t *StringType) Fingerprint() string { return typeFingerprint(t) }
-func (t *StringType) Layout() DataTypeLayout {
- return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
- SpecFixedWidth(Int32SizeBytes), SpecVariableWidth()}}
-}
+func (t *StringType) ID() Type { return STRING }
+func (t *StringType) Name() string { return "utf8" }
+func (t *StringType) String() string { return "utf8" }
+func (t *StringType) binary() {}
+func (t *StringType) Fingerprint() string { return
typeFingerprint(t) }
+func (t *StringType) Layout() DataTypeLayout { return stringTypeLayout
}
func (t *StringType) OffsetTypeTraits() OffsetTraits { return Int32Traits }
func (StringType) IsUtf8() bool { return true }
type LargeBinaryType struct{}
-func (t *LargeBinaryType) ID() Type { return LARGE_BINARY }
-func (t *LargeBinaryType) Name() string { return "large_binary" }
-func (t *LargeBinaryType) String() string { return "large_binary" }
-func (t *LargeBinaryType) binary() {}
-func (t *LargeBinaryType) Fingerprint() string { return typeFingerprint(t) }
-func (t *LargeBinaryType) Layout() DataTypeLayout {
- return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
- SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()}}
-}
+func (t *LargeBinaryType) ID() Type { return
LARGE_BINARY }
+func (t *LargeBinaryType) Name() string { return
"large_binary" }
+func (t *LargeBinaryType) String() string { return
"large_binary" }
+func (t *LargeBinaryType) binary() {}
+func (t *LargeBinaryType) Fingerprint() string { return
typeFingerprint(t) }
+func (t *LargeBinaryType) Layout() DataTypeLayout { return
largeBinaryTypeLayout }
func (t *LargeBinaryType) OffsetTypeTraits() OffsetTraits { return Int64Traits
}
func (LargeBinaryType) IsUtf8() bool { return false }
type LargeStringType struct{}
-func (t *LargeStringType) ID() Type { return LARGE_STRING }
-func (t *LargeStringType) Name() string { return "large_utf8" }
-func (t *LargeStringType) String() string { return "large_utf8" }
-func (t *LargeStringType) binary() {}
-func (t *LargeStringType) Fingerprint() string { return typeFingerprint(t) }
-func (t *LargeStringType) Layout() DataTypeLayout {
- return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(),
- SpecFixedWidth(Int64SizeBytes), SpecVariableWidth()}}
-}
+func (t *LargeStringType) ID() Type { return
LARGE_STRING }
+func (t *LargeStringType) Name() string { return
"large_utf8" }
+func (t *LargeStringType) String() string { return
"large_utf8" }
+func (t *LargeStringType) binary() {}
+func (t *LargeStringType) Fingerprint() string { return
typeFingerprint(t) }
+func (t *LargeStringType) Layout() DataTypeLayout { return
largeStringTypeLayout }
func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits
}
func (LargeStringType) IsUtf8() bool { return true }
type BinaryViewType struct{}
-func (*BinaryViewType) ID() Type { return BINARY_VIEW }
-func (*BinaryViewType) Name() string { return "binary_view" }
-func (*BinaryViewType) String() string { return "binary_view" }
-func (*BinaryViewType) IsUtf8() bool { return false }
-func (*BinaryViewType) binary() {}
-func (*BinaryViewType) view() {}
-func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
-func (*BinaryViewType) Layout() DataTypeLayout {
- variadic := SpecVariableWidth()
- return DataTypeLayout{
- Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(ViewHeaderSizeBytes)},
- VariadicSpec: &variadic,
- }
-}
+func (*BinaryViewType) ID() Type { return BINARY_VIEW }
+func (*BinaryViewType) Name() string { return "binary_view" }
+func (*BinaryViewType) String() string { return "binary_view" }
+func (*BinaryViewType) IsUtf8() bool { return false }
+func (*BinaryViewType) binary() {}
+func (*BinaryViewType) view() {}
+func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) }
+func (*BinaryViewType) Layout() DataTypeLayout { return binaryViewTypeLayout }
type StringViewType struct{}
-func (*StringViewType) ID() Type { return STRING_VIEW }
-func (*StringViewType) Name() string { return "string_view" }
-func (*StringViewType) String() string { return "string_view" }
-func (*StringViewType) IsUtf8() bool { return true }
-func (*StringViewType) binary() {}
-func (*StringViewType) view() {}
-func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) }
-func (*StringViewType) Layout() DataTypeLayout {
- variadic := SpecVariableWidth()
- return DataTypeLayout{
- Buffers: []BufferSpec{SpecBitmap(),
SpecFixedWidth(ViewHeaderSizeBytes)},
- VariadicSpec: &variadic,
- }
-}
+func (*StringViewType) ID() Type { return STRING_VIEW }
+func (*StringViewType) Name() string { return "string_view" }
+func (*StringViewType) String() string { return "string_view" }
+func (*StringViewType) IsUtf8() bool { return true }
+func (*StringViewType) binary() {}
+func (*StringViewType) view() {}
+func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) }
+func (*StringViewType) Layout() DataTypeLayout { return stringViewTypeLayout }
var (
BinaryTypes = struct {
diff --git a/arrow/schema.go b/arrow/schema.go
index 1e9e6414..806bd0d0 100644
--- a/arrow/schema.go
+++ b/arrow/schema.go
@@ -169,26 +169,38 @@ func NewSchema(fields []Field, metadata *Metadata)
*Schema {
}
func NewSchemaWithEndian(fields []Field, metadata *Metadata, e
endian.Endianness) *Schema {
+ var mdClone *Metadata
+ if metadata != nil {
+ md := metadata.clone()
+ mdClone = &md
+ }
+
+ fClone := make([]Field, len(fields))
+ copy(fClone, fields)
+
+ return newSchema(fClone, mdClone, e)
+}
+
+func newSchema(fields []Field, metadata *Metadata, e endian.Endianness)
*Schema {
sc := &Schema{
- fields: make([]Field, 0, len(fields)),
+ fields: fields,
index: make(map[string][]int, len(fields)),
endianness: e,
}
if metadata != nil {
- sc.meta = metadata.clone()
+ sc.meta = *metadata
}
for i, field := range fields {
if field.Type == nil {
panic("arrow: field with nil DataType")
}
- sc.fields = append(sc.fields, field)
sc.index[field.Name] = append(sc.index[field.Name], i)
}
return sc
}
func (sc *Schema) WithEndianness(e endian.Endianness) *Schema {
- return NewSchemaWithEndian(sc.fields, &sc.meta, e)
+ return newSchema(sc.fields, &sc.meta, e)
}
func (sc *Schema) Endianness() endian.Endianness { return sc.endianness }
@@ -207,11 +219,17 @@ func (sc *Schema) FieldsByName(n string) ([]Field, bool) {
if !ok {
return nil, ok
}
- fields := make([]Field, 0, len(indices))
- for _, v := range indices {
- fields = append(fields, sc.fields[v])
+ if len(indices) == 1 {
+ return sc.fields[indices[0] : indices[0]+1], ok
+ } else if len(indices) > 1 {
+ fields := make([]Field, 0, len(indices))
+ for _, v := range indices {
+ fields = append(fields, sc.fields[v])
+ }
+ return fields, ok
}
- return fields, ok
+
+ return nil, false
}
// FieldIndices returns the indices of the named field or nil.
@@ -250,11 +268,17 @@ func (s *Schema) AddField(i int, field Field) (*Schema,
error) {
return nil, fmt.Errorf("arrow: invalid field index %d", i)
}
- fields := make([]Field, len(s.fields)+1)
- copy(fields[:i], s.fields[:i])
- fields[i] = field
- copy(fields[i+1:], s.fields[i:])
- return NewSchema(fields, &s.meta), nil
+ var fields []Field
+ if i == len(s.fields) {
+ fields = append(s.fields, field)
+ } else {
+ fields = make([]Field, len(s.fields)+1)
+ copy(fields[:i], s.fields[:i])
+ fields[i] = field
+ copy(fields[i+1:], s.fields[i:])
+ }
+
+ return newSchema(fields, &s.meta, s.endianness), nil
}
func (s *Schema) String() string {