This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new adfd482518 GH-36671: [Go] BinaryMemoTable optimize allocations of
GetOrInsert (#36811)
adfd482518 is described below
commit adfd482518cf2c7ff620cc1986ad38e51b378e1f
Author: Thor <[email protected]>
AuthorDate: Tue Jul 25 14:09:49 2023 -0500
GH-36671: [Go] BinaryMemoTable optimize allocations of GetOrInsert (#36811)
### Rationale for this change
The hashing.MemoTable provides an interface with
GetOrInsert(val interface{}) (idx int, existed bool, err error)
This can cause a costly allocation for binary dictionaries as is detailed
in issue https://github.com/apache/arrow/issues/36671
If we expand the MemoTable interface to include:
GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
We can avoid the allocations at runtime to convert from `[]byte` to
`interface{}`
### What changes are included in this PR?
No logic was changed with the BinaryMemoTable but instead the same API for
`GetOrInsert` was copied to a type specific API of `GetOrInsertBytes`.
The `BinaryDictionaryBuilder` now simple calls these bytes methods instead
of the generic `interface{}` ones.
### Are these changes tested?
No additional tests were included as the current tests exercise this code.
### Are there any user-facing changes?
* Closes: #36671
Authored-by: thorfour <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
go/arrow/array/dictionary.go | 20 ++++++++--
go/arrow/array/dictionary_test.go | 19 ++++++++++
go/internal/hashing/xxh3_memo_table.gen.go | 50 +++++++++++++++++++++++++
go/internal/hashing/xxh3_memo_table.gen.go.tmpl | 6 +++
go/internal/hashing/xxh3_memo_table.go | 22 +++++++++++
5 files changed, 114 insertions(+), 3 deletions(-)
diff --git a/go/arrow/array/dictionary.go b/go/arrow/array/dictionary.go
index da1aea50b2..ccb6f32321 100644
--- a/go/arrow/array/dictionary.go
+++ b/go/arrow/array/dictionary.go
@@ -842,6 +842,11 @@ func (b *dictionaryBuilder) insertDictValue(val
interface{}) error {
return err
}
+func (b *dictionaryBuilder) insertDictBytes(val []byte) error {
+ _, _, err := b.memoTable.GetOrInsertBytes(val)
+ return err
+}
+
func (b *dictionaryBuilder) appendValue(val interface{}) error {
idx, _, err := b.memoTable.GetOrInsert(val)
b.idxBuilder.Append(idx)
@@ -849,6 +854,13 @@ func (b *dictionaryBuilder) appendValue(val interface{})
error {
return err
}
+func (b *dictionaryBuilder) appendBytes(val []byte) error {
+ idx, _, err := b.memoTable.GetOrInsertBytes(val)
+ b.idxBuilder.Append(idx)
+ b.length += 1
+ return err
+}
+
func getvalFn(arr arrow.Array) func(i int) interface{} {
switch typedarr := arr.(type) {
case *Int8:
@@ -1285,16 +1297,18 @@ func (b *BinaryDictionaryBuilder) Append(v []byte)
error {
b.AppendNull()
return nil
}
- return b.appendValue(v)
+
+ return b.appendBytes(v)
}
-func (b *BinaryDictionaryBuilder) AppendString(v string) error { return
b.appendValue(v) }
+
+func (b *BinaryDictionaryBuilder) AppendString(v string) error { return
b.appendBytes([]byte(v)) }
func (b *BinaryDictionaryBuilder) InsertDictValues(arr *Binary) (err error) {
if !arrow.TypeEqual(arr.DataType(), b.dt.ValueType) {
return fmt.Errorf("dictionary insert type mismatch: cannot
insert values of type %T to dictionary type %T", arr.DataType(), b.dt.ValueType)
}
for i := 0; i < arr.Len(); i++ {
- if err = b.insertDictValue(arr.Value(i)); err != nil {
+ if err = b.insertDictBytes(arr.Value(i)); err != nil {
break
}
}
diff --git a/go/arrow/array/dictionary_test.go
b/go/arrow/array/dictionary_test.go
index 8bb9edebf8..99c8e6ffcd 100644
--- a/go/arrow/array/dictionary_test.go
+++ b/go/arrow/array/dictionary_test.go
@@ -19,6 +19,7 @@ package array_test
import (
"fmt"
"math"
+ "math/rand"
"reflect"
"strings"
"testing"
@@ -1846,3 +1847,21 @@ func TestBinaryDictionaryPanic(t *testing.T) {
}()
assert.True(t, allocator.paniced)
}
+
+func BenchmarkBinaryDictionaryBuilder(b *testing.B) {
+ mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+ defer mem.AssertSize(b, 0)
+
+ dictType := &arrow.DictionaryType{IndexType: &arrow.Int32Type{},
ValueType: arrow.BinaryTypes.String}
+ bldr := array.NewDictionaryBuilder(mem, dictType)
+ defer bldr.Release()
+
+ randString := func() string {
+ return fmt.Sprintf("test-%d", rand.Intn(30))
+ }
+
+ builder := bldr.(*array.BinaryDictionaryBuilder)
+ for i := 0; i < b.N; i++ {
+ assert.NoError(b, builder.AppendString(randString()))
+ }
+}
diff --git a/go/internal/hashing/xxh3_memo_table.gen.go
b/go/internal/hashing/xxh3_memo_table.gen.go
index 0c36aee950..f561c5f30f 100644
--- a/go/internal/hashing/xxh3_memo_table.gen.go
+++ b/go/internal/hashing/xxh3_memo_table.gen.go
@@ -298,6 +298,11 @@ func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx
int, found bool, err e
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err
error) {
+ panic("unimplemented")
+}
+
type payloadUint8 struct {
val uint8
memoIdx int32
@@ -570,6 +575,11 @@ func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx
int, found bool, err
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadInt16 struct {
val int16
memoIdx int32
@@ -842,6 +852,11 @@ func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx
int, found bool, err
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadUint16 struct {
val uint16
memoIdx int32
@@ -1114,6 +1129,11 @@ func (s *Uint16MemoTable) GetOrInsert(val interface{})
(idx int, found bool, err
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadInt32 struct {
val int32
memoIdx int32
@@ -1386,6 +1406,11 @@ func (s *Int32MemoTable) GetOrInsert(val interface{})
(idx int, found bool, err
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadInt64 struct {
val int64
memoIdx int32
@@ -1658,6 +1683,11 @@ func (s *Int64MemoTable) GetOrInsert(val interface{})
(idx int, found bool, err
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadUint32 struct {
val uint32
memoIdx int32
@@ -1930,6 +1960,11 @@ func (s *Uint32MemoTable) GetOrInsert(val interface{})
(idx int, found bool, err
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadUint64 struct {
val uint64
memoIdx int32
@@ -2202,6 +2237,11 @@ func (s *Uint64MemoTable) GetOrInsert(val interface{})
(idx int, found bool, err
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadFloat32 struct {
val float32
memoIdx int32
@@ -2493,6 +2533,11 @@ func (s *Float32MemoTable) GetOrInsert(val interface{})
(idx int, found bool, er
return
}
+// GetOrInsertBytes is unimplemented
+func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
+
type payloadFloat64 struct {
val float64
memoIdx int32
@@ -2781,3 +2826,8 @@ func (s *Float64MemoTable) GetOrInsert(val interface{})
(idx int, found bool, er
}
return
}
+
+// GetOrInsertBytes is unimplemented
+func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ panic("unimplemented")
+}
diff --git a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl
b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl
index 94c893b94b..10127c43cc 100644
--- a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl
+++ b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl
@@ -340,4 +340,10 @@ func (s *{{.Name}}MemoTable) GetOrInsert(val interface{})
(idx int, found bool,
}
return
}
+
+
+// GetOrInsertBytes is unimplemented
+func (s *{{.Name}}MemoTable) GetOrInsertBytes(val []byte) (idx int, found
bool, err error) {
+ panic("unimplemented")
+}
{{end}}
diff --git a/go/internal/hashing/xxh3_memo_table.go
b/go/internal/hashing/xxh3_memo_table.go
index 67e2aef380..81994f0a88 100644
--- a/go/internal/hashing/xxh3_memo_table.go
+++ b/go/internal/hashing/xxh3_memo_table.go
@@ -53,6 +53,12 @@ type MemoTable interface {
// the table (if false, the value was inserted). An error is returned
// if val is not the appropriate type for the table.
GetOrInsert(val interface{}) (idx int, existed bool, err error)
+ // GetOrInsertBytes returns the index of the table the specified value
is,
+ // and a boolean indicating whether or not the value was found in
+ // the table (if false, the value was inserted). An error is returned
+ // if val is not the appropriate type for the table. This function is
intended to be used by
+ // the BinaryMemoTable to prevent uncessary allocations of the data
when converting from a []byte to interface{}.
+ GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
// GetOrInsertNull returns the index of the null value in the table,
// inserting one if it hasn't already been inserted. It returns a
boolean
// indicating if the null value already existed or not in the table.
@@ -231,6 +237,22 @@ func (b *BinaryMemoTable) Get(val interface{}) (int, bool)
{
return KeyNotFound, false
}
+// GetOrInsertBytes returns the index of the given value in the table, if not
found
+// it is inserted into the table. The return value 'found' indicates whether
the value
+// was found in the table (true) or inserted (false) along with any possible
error.
+func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool,
err error) {
+ h := Hash(val, 0)
+ p, found := b.lookup(h, val)
+ if found {
+ idx = int(p.payload.val)
+ } else {
+ idx = b.Size()
+ b.builder.Append(val)
+ b.tbl.Insert(p, h, int32(idx), -1)
+ }
+ return
+}
+
// GetOrInsert returns the index of the given value in the table, if not found
// it is inserted into the table. The return value 'found' indicates whether
the value
// was found in the table (true) or inserted (false) along with any possible
error.