This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new adfd482518 GH-36671: [Go] BinaryMemoTable optimize allocations of 
GetOrInsert (#36811)
adfd482518 is described below

commit adfd482518cf2c7ff620cc1986ad38e51b378e1f
Author: Thor <[email protected]>
AuthorDate: Tue Jul 25 14:09:49 2023 -0500

    GH-36671: [Go] BinaryMemoTable optimize allocations of GetOrInsert (#36811)
    
    
    
    ### Rationale for this change
    
    The hashing.MemoTable provides an interface with
    
        GetOrInsert(val interface{}) (idx int, existed bool, err error)
    
    This can cause a costly allocation for binary dictionaries as is detailed 
in issue https://github.com/apache/arrow/issues/36671
    
    If we expand the MemoTable interface to include:
    
    
        GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
    
    We can avoid the allocations at runtime to convert from `[]byte` to 
`interface{}`
    
    ### What changes are included in this PR?
    
    No logic was changed with the BinaryMemoTable but instead the same API for 
`GetOrInsert` was copied to a type specific API of `GetOrInsertBytes`.
    
    The `BinaryDictionaryBuilder` now simple calls these bytes methods instead 
of the generic `interface{}` ones.
    
    ### Are these changes tested?
    
    No additional tests were included as the current tests exercise this code.
    
    ### Are there any user-facing changes?
    
    * Closes: #36671
    
    Authored-by: thorfour <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 go/arrow/array/dictionary.go                    | 20 ++++++++--
 go/arrow/array/dictionary_test.go               | 19 ++++++++++
 go/internal/hashing/xxh3_memo_table.gen.go      | 50 +++++++++++++++++++++++++
 go/internal/hashing/xxh3_memo_table.gen.go.tmpl |  6 +++
 go/internal/hashing/xxh3_memo_table.go          | 22 +++++++++++
 5 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/go/arrow/array/dictionary.go b/go/arrow/array/dictionary.go
index da1aea50b2..ccb6f32321 100644
--- a/go/arrow/array/dictionary.go
+++ b/go/arrow/array/dictionary.go
@@ -842,6 +842,11 @@ func (b *dictionaryBuilder) insertDictValue(val 
interface{}) error {
        return err
 }
 
+func (b *dictionaryBuilder) insertDictBytes(val []byte) error {
+       _, _, err := b.memoTable.GetOrInsertBytes(val)
+       return err
+}
+
 func (b *dictionaryBuilder) appendValue(val interface{}) error {
        idx, _, err := b.memoTable.GetOrInsert(val)
        b.idxBuilder.Append(idx)
@@ -849,6 +854,13 @@ func (b *dictionaryBuilder) appendValue(val interface{}) 
error {
        return err
 }
 
+func (b *dictionaryBuilder) appendBytes(val []byte) error {
+       idx, _, err := b.memoTable.GetOrInsertBytes(val)
+       b.idxBuilder.Append(idx)
+       b.length += 1
+       return err
+}
+
 func getvalFn(arr arrow.Array) func(i int) interface{} {
        switch typedarr := arr.(type) {
        case *Int8:
@@ -1285,16 +1297,18 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) 
error {
                b.AppendNull()
                return nil
        }
-       return b.appendValue(v)
+
+       return b.appendBytes(v)
 }
-func (b *BinaryDictionaryBuilder) AppendString(v string) error { return 
b.appendValue(v) }
+
+func (b *BinaryDictionaryBuilder) AppendString(v string) error { return 
b.appendBytes([]byte(v)) }
 func (b *BinaryDictionaryBuilder) InsertDictValues(arr *Binary) (err error) {
        if !arrow.TypeEqual(arr.DataType(), b.dt.ValueType) {
                return fmt.Errorf("dictionary insert type mismatch: cannot 
insert values of type %T to dictionary type %T", arr.DataType(), b.dt.ValueType)
        }
 
        for i := 0; i < arr.Len(); i++ {
-               if err = b.insertDictValue(arr.Value(i)); err != nil {
+               if err = b.insertDictBytes(arr.Value(i)); err != nil {
                        break
                }
        }
diff --git a/go/arrow/array/dictionary_test.go 
b/go/arrow/array/dictionary_test.go
index 8bb9edebf8..99c8e6ffcd 100644
--- a/go/arrow/array/dictionary_test.go
+++ b/go/arrow/array/dictionary_test.go
@@ -19,6 +19,7 @@ package array_test
 import (
        "fmt"
        "math"
+       "math/rand"
        "reflect"
        "strings"
        "testing"
@@ -1846,3 +1847,21 @@ func TestBinaryDictionaryPanic(t *testing.T) {
        }()
        assert.True(t, allocator.paniced)
 }
+
+func BenchmarkBinaryDictionaryBuilder(b *testing.B) {
+       mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+       defer mem.AssertSize(b, 0)
+
+       dictType := &arrow.DictionaryType{IndexType: &arrow.Int32Type{}, 
ValueType: arrow.BinaryTypes.String}
+       bldr := array.NewDictionaryBuilder(mem, dictType)
+       defer bldr.Release()
+
+       randString := func() string {
+               return fmt.Sprintf("test-%d", rand.Intn(30))
+       }
+
+       builder := bldr.(*array.BinaryDictionaryBuilder)
+       for i := 0; i < b.N; i++ {
+               assert.NoError(b, builder.AppendString(randString()))
+       }
+}
diff --git a/go/internal/hashing/xxh3_memo_table.gen.go 
b/go/internal/hashing/xxh3_memo_table.gen.go
index 0c36aee950..f561c5f30f 100644
--- a/go/internal/hashing/xxh3_memo_table.gen.go
+++ b/go/internal/hashing/xxh3_memo_table.gen.go
@@ -298,6 +298,11 @@ func (s *Int8MemoTable) GetOrInsert(val interface{}) (idx 
int, found bool, err e
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Int8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err 
error) {
+       panic("unimplemented")
+}
+
 type payloadUint8 struct {
        val     uint8
        memoIdx int32
@@ -570,6 +575,11 @@ func (s *Uint8MemoTable) GetOrInsert(val interface{}) (idx 
int, found bool, err
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Uint8MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadInt16 struct {
        val     int16
        memoIdx int32
@@ -842,6 +852,11 @@ func (s *Int16MemoTable) GetOrInsert(val interface{}) (idx 
int, found bool, err
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Int16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadUint16 struct {
        val     uint16
        memoIdx int32
@@ -1114,6 +1129,11 @@ func (s *Uint16MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool, err
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Uint16MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadInt32 struct {
        val     int32
        memoIdx int32
@@ -1386,6 +1406,11 @@ func (s *Int32MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool, err
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Int32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadInt64 struct {
        val     int64
        memoIdx int32
@@ -1658,6 +1683,11 @@ func (s *Int64MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool, err
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Int64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadUint32 struct {
        val     uint32
        memoIdx int32
@@ -1930,6 +1960,11 @@ func (s *Uint32MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool, err
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Uint32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadUint64 struct {
        val     uint64
        memoIdx int32
@@ -2202,6 +2237,11 @@ func (s *Uint64MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool, err
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Uint64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadFloat32 struct {
        val     float32
        memoIdx int32
@@ -2493,6 +2533,11 @@ func (s *Float32MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool, er
        return
 }
 
+// GetOrInsertBytes is unimplemented
+func (s *Float32MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
+
 type payloadFloat64 struct {
        val     float64
        memoIdx int32
@@ -2781,3 +2826,8 @@ func (s *Float64MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool, er
        }
        return
 }
+
+// GetOrInsertBytes is unimplemented
+func (s *Float64MemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       panic("unimplemented")
+}
diff --git a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl 
b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl
index 94c893b94b..10127c43cc 100644
--- a/go/internal/hashing/xxh3_memo_table.gen.go.tmpl
+++ b/go/internal/hashing/xxh3_memo_table.gen.go.tmpl
@@ -340,4 +340,10 @@ func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) 
(idx int, found bool,
   }
   return
 }
+
+
+// GetOrInsertBytes is unimplemented
+func (s *{{.Name}}MemoTable) GetOrInsertBytes(val []byte) (idx int, found 
bool, err error) {
+    panic("unimplemented")
+}
 {{end}}
diff --git a/go/internal/hashing/xxh3_memo_table.go 
b/go/internal/hashing/xxh3_memo_table.go
index 67e2aef380..81994f0a88 100644
--- a/go/internal/hashing/xxh3_memo_table.go
+++ b/go/internal/hashing/xxh3_memo_table.go
@@ -53,6 +53,12 @@ type MemoTable interface {
        // the table (if false, the value was inserted). An error is returned
        // if val is not the appropriate type for the table.
        GetOrInsert(val interface{}) (idx int, existed bool, err error)
+       // GetOrInsertBytes returns the index of the table the specified value 
is,
+       // and a boolean indicating whether or not the value was found in
+       // the table (if false, the value was inserted). An error is returned
+       // if val is not the appropriate type for the table. This function is 
intended to be used by
+       // the BinaryMemoTable to prevent uncessary allocations of the data 
when converting from a []byte to interface{}.
+       GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
        // GetOrInsertNull returns the index of the null value in the table,
        // inserting one if it hasn't already been inserted. It returns a 
boolean
        // indicating if the null value already existed or not in the table.
@@ -231,6 +237,22 @@ func (b *BinaryMemoTable) Get(val interface{}) (int, bool) 
{
        return KeyNotFound, false
 }
 
+// GetOrInsertBytes returns the index of the given value in the table, if not 
found
+// it is inserted into the table. The return value 'found' indicates whether 
the value
+// was found in the table (true) or inserted (false) along with any possible 
error.
+func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, 
err error) {
+       h := Hash(val, 0)
+       p, found := b.lookup(h, val)
+       if found {
+               idx = int(p.payload.val)
+       } else {
+               idx = b.Size()
+               b.builder.Append(val)
+               b.tbl.Insert(p, h, int32(idx), -1)
+       }
+       return
+}
+
 // GetOrInsert returns the index of the given value in the table, if not found
 // it is inserted into the table. The return value 'found' indicates whether 
the value
 // was found in the table (true) or inserted (false) along with any possible 
error.

Reply via email to