This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 946bdcf ARROW-13963: [Go] Minor: Add bitmap reader/writer impl from
go Parquet module to Arrow Bitutil package
946bdcf is described below
commit 946bdcf83611c08ca90610467cc862746f9622bd
Author: Matthew Topol <[email protected]>
AuthorDate: Thu Sep 9 14:25:54 2021 -0400
ARROW-13963: [Go] Minor: Add bitmap reader/writer impl from go Parquet
module to Arrow Bitutil package
This will be followed by a PR for ARROW-13964 which will point the Parquet
package to use the Arrow bitutil shared implementations. This will let further
changes use the same implementations instead of copying the implementation and
having it in both packages.
Closes #11124 from zeroshade/bitutils
Authored-by: Matthew Topol <[email protected]>
Signed-off-by: Matthew Topol <[email protected]>
---
go/arrow/bitutil/bitmaps.go | 435 +++++++++++++++++++++++++++++++++++++++
go/arrow/bitutil/bitmaps_test.go | 244 ++++++++++++++++++++++
go/arrow/bitutil/bitutil.go | 61 ++++++
go/arrow/bitutil/bitutil_test.go | 33 +++
go/arrow/go.sum | 4 -
5 files changed, 773 insertions(+), 4 deletions(-)
diff --git a/go/arrow/bitutil/bitmaps.go b/go/arrow/bitutil/bitmaps.go
new file mode 100644
index 0000000..206cde7
--- /dev/null
+++ b/go/arrow/bitutil/bitmaps.go
@@ -0,0 +1,435 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bitutil
+
+import (
+ "math/bits"
+ "unsafe"
+
+ "github.com/apache/arrow/go/arrow/endian"
+ "github.com/apache/arrow/go/arrow/internal/debug"
+)
+
+// helper function to handle big-endian architectures properly
+var toFromLEFunc func(uint64) uint64
+
+func init() {
+ if endian.IsBigEndian {
+ // if we're on a big endian architecture, then use the reverse
bytes
+ // function so we can perform byte-swaps when necessary
+ toFromLEFunc = bits.ReverseBytes64
+ } else {
+ // identity function if we're on a little endian architecture
+ toFromLEFunc = func(in uint64) uint64 { return in }
+ }
+}
+
+// BitmapReader is a simple bitmap reader for a byte slice.
+type BitmapReader struct {
+ bitmap []byte
+ pos int
+ len int
+
+ current byte
+ byteOffset int
+ bitOffset int
+}
+
+// NewBitmapReader creates and returns a new bitmap reader for the given bitmap
+func NewBitmapReader(bitmap []byte, offset, length int) *BitmapReader {
+ curbyte := byte(0)
+ if length > 0 && bitmap != nil {
+ curbyte = bitmap[offset/8]
+ }
+ return &BitmapReader{
+ bitmap: bitmap,
+ byteOffset: offset / 8,
+ bitOffset: offset % 8,
+ current: curbyte,
+ len: length,
+ }
+}
+
+// Set returns true if the current bit is set
+func (b *BitmapReader) Set() bool {
+ return (b.current & (1 << b.bitOffset)) != 0
+}
+
+// NotSet returns true if the current bit is not set
+func (b *BitmapReader) NotSet() bool {
+ return (b.current & (1 << b.bitOffset)) == 0
+}
+
+// Next advances the reader to the next bit in the bitmap.
+func (b *BitmapReader) Next() {
+ b.bitOffset++
+ b.pos++
+ if b.bitOffset == 8 {
+ b.bitOffset = 0
+ b.byteOffset++
+ if b.pos < b.len {
+ b.current = b.bitmap[int(b.byteOffset)]
+ }
+ }
+}
+
+// Pos returns the current bit position in the bitmap that the reader is
looking at
+func (b *BitmapReader) Pos() int { return b.pos }
+
+// Len returns the total number of bits in the bitmap
+func (b *BitmapReader) Len() int { return b.len }
+
+// BitmapWriter is a simple writer for writing bitmaps to byte slices
+type BitmapWriter struct {
+ buf []byte
+ pos int
+ length int
+
+ curByte uint8
+ bitMask uint8
+ byteOffset int
+}
+
+// NewBitmapWriter returns a sequential bitwise writer that preserves
surrounding
+// bit values as it writes.
+func NewBitmapWriter(bitmap []byte, start, length int) *BitmapWriter {
+ ret := &BitmapWriter{
+ buf: bitmap,
+ length: length,
+ byteOffset: start / 8,
+ bitMask: BitMask[start%8],
+ }
+ if length > 0 {
+ ret.curByte = bitmap[int(ret.byteOffset)]
+ }
+ return ret
+}
+
+// Reset resets the position and view of the slice to restart writing a bitmap
+// to the same byte slice.
+func (b *BitmapWriter) Reset(start, length int) {
+ b.pos = 0
+ b.byteOffset = start / 8
+ b.bitMask = BitMask[start%8]
+ b.length = length
+ if b.length > 0 {
+ b.curByte = b.buf[int(b.byteOffset)]
+ }
+}
+
+func (b *BitmapWriter) Pos() int { return b.pos }
+func (b *BitmapWriter) Set() { b.curByte |= b.bitMask }
+func (b *BitmapWriter) Clear() { b.curByte &= ^b.bitMask }
+
+// Next increments the writer to the next bit for writing.
+func (b *BitmapWriter) Next() {
+ b.bitMask = b.bitMask << 1
+ b.pos++
+ if b.bitMask == 0 {
+ b.bitMask = 0x01
+ b.buf[b.byteOffset] = b.curByte
+ b.byteOffset++
+ if b.pos < b.length {
+ b.curByte = b.buf[int(b.byteOffset)]
+ }
+ }
+}
+
+// AppendBools writes a series of booleans to the bitmapwriter and returns
+// the number of remaining bytes left in the buffer for writing.
+func (b *BitmapWriter) AppendBools(in []bool) int {
+ space := min(int(BytesForBits(int64(b.length-b.pos))), len(in))
+
+ // location that the first byte needs to be written to for appending
+ appslice := b.buf[int(b.byteOffset):]
+ // update everything but curByte
+ bitOffset := bits.TrailingZeros32(uint32(b.bitMask))
+ appslice[0] = b.curByte
+ for i, b := range in[:space] {
+ if b {
+ SetBit(appslice, i)
+ } else {
+ ClearBit(appslice, i)
+ }
+ }
+
+ b.pos += space
+ b.bitMask = BitMask[(bitOffset+space)%8]
+ b.byteOffset += (bitOffset + space) / 8
+ b.curByte = appslice[len(appslice)-1]
+
+ return int(space)
+}
+
+// Finish flushes the final byte out to the byteslice in case it was not
already
+// on a byte aligned boundary.
+func (b *BitmapWriter) Finish() {
+ if b.length > 0 && (b.bitMask != 0x01 || b.pos < b.length) {
+ b.buf[int(b.byteOffset)] = b.curByte
+ }
+}
+
+// BitmapWordReader is a reader for bitmaps that reads a word at a time (a
word being an 8 byte uint64)
+// and then provides functions to grab the individual trailing bytes after the
last word
+type BitmapWordReader struct {
+ bitmap []byte
+ offset int
+ nwords int
+ trailingBits int
+ trailingBytes int
+ curword uint64
+}
+
+// NewBitmapWordReader sets up a word reader, calculates the number of
trailing bits and
+// number of trailing bytes, along with the number of words.
+func NewBitmapWordReader(bitmap []byte, offset, length int) *BitmapWordReader {
+ bitoffset := offset % 8
+ byteOffset := offset / 8
+ bm := &BitmapWordReader{
+ offset: bitoffset,
+ bitmap: bitmap[byteOffset :
byteOffset+int(BytesForBits(int64(bitoffset+length)))],
+ // decrement wordcount by 1 as we may touch two adjacent words
in one iteration
+ nwords: length/int(unsafe.Sizeof(uint64(0))*8) - 1,
+ }
+ if bm.nwords < 0 {
+ bm.nwords = 0
+ }
+ bm.trailingBits = length - bm.nwords*int(unsafe.Sizeof(uint64(0)))*8
+ bm.trailingBytes = int(BytesForBits(int64(bm.trailingBits)))
+
+ if bm.nwords > 0 {
+ bm.curword = toFromLEFunc(endian.Native.Uint64(bm.bitmap))
+ } else {
+ bm.curword = toFromLEFunc(uint64(bm.bitmap[0]))
+ }
+ return bm
+}
+
+// NextWord returns the next full word read from the bitmap, should not be
called
+// if Words() is 0 as it will step outside of the bounds of the bitmap slice
and panic.
+//
+// We don't perform the bounds checking in order to improve performance.
+func (bm *BitmapWordReader) NextWord() uint64 {
+ bm.bitmap = bm.bitmap[unsafe.Sizeof(bm.curword):]
+ word := bm.curword
+ nextWord := toFromLEFunc(endian.Native.Uint64(bm.bitmap))
+ if bm.offset != 0 {
+ // combine two adjacent words into one word
+ // |<------ next ----->|<---- current ---->|
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // | | offset
+ // v v
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // |<------ word ----->|
+ word >>= uint64(bm.offset)
+ word |= nextWord << (int64(unsafe.Sizeof(uint64(0))*8) -
int64(bm.offset))
+ }
+ bm.curword = nextWord
+ return word
+}
+
+// NextTrailingByte returns the next trailing byte of the bitmap after the
last word
+// along with the number of valid bits in that byte. When validBits < 8, that
+// is the last byte.
+//
+// If the bitmap ends on a byte alignment, then the last byte can also return
8 valid bits.
+// Thus the TrailingBytes function should be used to know how many trailing
bytes to read.
+func (bm *BitmapWordReader) NextTrailingByte() (val byte, validBits int) {
+ debug.Assert(bm.trailingBits > 0, "next trailing byte called with no
trailing bits")
+
+ if bm.trailingBits <= 8 {
+ // last byte
+ validBits = bm.trailingBits
+ bm.trailingBits = 0
+ rdr := NewBitmapReader(bm.bitmap, bm.offset, validBits)
+ for i := 0; i < validBits; i++ {
+ val >>= 1
+ if rdr.Set() {
+ val |= 0x80
+ }
+ rdr.Next()
+ }
+ val >>= (8 - validBits)
+ return
+ }
+
+ bm.bitmap = bm.bitmap[1:]
+ nextByte := bm.bitmap[0]
+ val = (*[8]byte)(unsafe.Pointer(&bm.curword))[0]
+ if bm.offset != 0 {
+ val >>= byte(bm.offset)
+ val |= nextByte << (8 - bm.offset)
+ }
+ (*[8]byte)(unsafe.Pointer(&bm.curword))[0] = nextByte
+ bm.trailingBits -= 8
+ bm.trailingBytes--
+ validBits = 8
+ return
+}
+
+func (bm *BitmapWordReader) Words() int { return bm.nwords }
+func (bm *BitmapWordReader) TrailingBytes() int { return bm.trailingBytes }
+
+// BitmapWordWriter is a bitmap writer for writing a full word at a time (a
word being
+// a uint64). After the last full word is written, PutNextTrailingByte can be
used to
+// write the remaining trailing bytes.
+type BitmapWordWriter struct {
+ bitmap []byte
+ offset int
+ len int
+
+ bitMask uint64
+ currentWord uint64
+}
+
+// NewBitmapWordWriter initializes a new bitmap word writer which will start
writing
+// into the byte slice at bit offset start, expecting to write len bits.
+func NewBitmapWordWriter(bitmap []byte, start, len int) *BitmapWordWriter {
+ ret := &BitmapWordWriter{
+ bitmap: bitmap[start/8:],
+ len: len,
+ offset: start % 8,
+ bitMask: (uint64(1) << uint64(start%8)) - 1,
+ }
+
+ if ret.offset != 0 {
+ if ret.len >= int(unsafe.Sizeof(uint64(0))*8) {
+ ret.currentWord =
toFromLEFunc(endian.Native.Uint64(ret.bitmap))
+ } else if ret.len > 0 {
+ ret.currentWord = toFromLEFunc(uint64(ret.bitmap[0]))
+ }
+ }
+ return ret
+}
+
+// PutNextWord writes the given word to the bitmap, potentially splitting
across
+// two adjacent words.
+func (bm *BitmapWordWriter) PutNextWord(word uint64) {
+ sz := int(unsafe.Sizeof(word))
+ if bm.offset != 0 {
+ // split one word into two adjacent words, don't touch unused
bits
+ // |<------ word ----->|
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // | |
+ // v v offset
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // |<------ next ----->|<---- current ---->|
+ word = (word << uint64(bm.offset)) | (word >> (int64(sz*8) -
int64(bm.offset)))
+ next := endian.Native.Uint64(bm.bitmap[sz:])
+ bm.currentWord = (bm.currentWord & bm.bitMask) | (word &^
bm.bitMask)
+ next = (next &^ bm.bitMask) | (word & bm.bitMask)
+ endian.Native.PutUint64(bm.bitmap, toFromLEFunc(bm.currentWord))
+ endian.Native.PutUint64(bm.bitmap[sz:], toFromLEFunc(next))
+ bm.currentWord = next
+ } else {
+ endian.Native.PutUint64(bm.bitmap, toFromLEFunc(word))
+ }
+ bm.bitmap = bm.bitmap[sz:]
+}
+
+// PutNextTrailingByte writes the number of bits indicated by validBits from b
to
+// the bitmap.
+func (bm *BitmapWordWriter) PutNextTrailingByte(b byte, validBits int) {
+ curbyte := (*[8]byte)(unsafe.Pointer(&bm.currentWord))[0]
+ if validBits == 8 {
+ if bm.offset != 0 {
+ b = (b << bm.offset) | (b >> (8 - bm.offset))
+ next := bm.bitmap[1]
+ curbyte = (curbyte & byte(bm.bitMask)) | (b &^
byte(bm.bitMask))
+ next = (next &^ byte(bm.bitMask)) | (b &
byte(bm.bitMask))
+ bm.bitmap[0] = curbyte
+ bm.bitmap[1] = next
+ bm.currentWord = uint64(next)
+ } else {
+ bm.bitmap[0] = b
+ }
+ bm.bitmap = bm.bitmap[1:]
+ } else {
+ debug.Assert(validBits > 0 && validBits < 8, "invalid valid
bits in bitmap word writer")
+ debug.Assert(BytesForBits(int64(bm.offset+validBits)) <=
int64(len(bm.bitmap)), "writing trailiing byte outside of bounds of bitmap")
+ wr := NewBitmapWriter(bm.bitmap, int(bm.offset), validBits)
+ for i := 0; i < validBits; i++ {
+ if b&0x01 != 0 {
+ wr.Set()
+ } else {
+ wr.Clear()
+ }
+ wr.Next()
+ b >>= 1
+ }
+ wr.Finish()
+ }
+}
+
+// CopyBitmap copies the bitmap indicated by src, starting at bit offset
srcOffset,
+// and copying length bits into dst, starting at bit offset dstOffset.
+func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
+ if length == 0 {
+ // if there's nothing to write, end early.
+ return
+ }
+
+ bitOffset := srcOffset % 8
+ destBitOffset := dstOffset % 8
+
+ // slow path, one of the bitmaps are not byte aligned.
+ if bitOffset != 0 || destBitOffset != 0 {
+ rdr := NewBitmapWordReader(src, srcOffset, length)
+ wr := NewBitmapWordWriter(dst, dstOffset, length)
+
+ nwords := rdr.Words()
+ for nwords > 0 {
+ nwords--
+ wr.PutNextWord(rdr.NextWord())
+ }
+ nbytes := rdr.TrailingBytes()
+ for nbytes > 0 {
+ nbytes--
+ bt, validBits := rdr.NextTrailingByte()
+ wr.PutNextTrailingByte(bt, validBits)
+ }
+ return
+ }
+
+ // fast path, both are starting with byte-aligned bitmaps
+ nbytes := int(BytesForBits(int64(length)))
+
+ // shift by its byte offset
+ src = src[srcOffset/8:]
+ dst = dst[dstOffset/8:]
+
+ // Take care of the trailing bits in the last byte
+ // E.g., if trailing_bits = 5, last byte should be
+ // - low 3 bits: new bits from last byte of data buffer
+ // - high 5 bits: old bits from last byte of dest buffer
+ trailingBits := nbytes*8 - length
+ trailMask := byte(uint(1)<<(8-trailingBits)) - 1
+
+ copy(dst, src[:nbytes-1])
+ lastData := src[nbytes-1]
+
+ dst[nbytes-1] &= ^trailMask
+ dst[nbytes-1] |= lastData & trailMask
+}
diff --git a/go/arrow/bitutil/bitmaps_test.go b/go/arrow/bitutil/bitmaps_test.go
new file mode 100644
index 0000000..419fc1b
--- /dev/null
+++ b/go/arrow/bitutil/bitmaps_test.go
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bitutil_test
+
+import (
+ "fmt"
+ "math/rand"
+ "strconv"
+ "testing"
+
+ "github.com/apache/arrow/go/arrow/bitutil"
+ "github.com/stretchr/testify/assert"
+)
+
+func bitmapFromSlice(vals []int, bitOffset int) []byte {
+ out := make([]byte,
int(bitutil.BytesForBits(int64(len(vals)+bitOffset))))
+ writer := bitutil.NewBitmapWriter(out, bitOffset, len(vals))
+ for _, val := range vals {
+ if val == 1 {
+ writer.Set()
+ } else {
+ writer.Clear()
+ }
+ writer.Next()
+ }
+ writer.Finish()
+
+ return out
+}
+
+func assertReaderVals(t *testing.T, reader *bitutil.BitmapReader, vals []bool)
{
+ for _, v := range vals {
+ if v {
+ assert.True(t, reader.Set())
+ assert.False(t, reader.NotSet())
+ } else {
+ assert.True(t, reader.NotSet())
+ assert.False(t, reader.Set())
+ }
+ reader.Next()
+ }
+}
+
+func TestNormalOperation(t *testing.T) {
+ for _, offset := range []int{0, 1, 3, 5, 7, 8, 12, 13, 21, 38, 75, 120}
{
+ buf := bitmapFromSlice([]int{0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
1, 0, 1}, offset)
+
+ reader := bitutil.NewBitmapReader(buf, offset, 14)
+ assertReaderVals(t, reader, []bool{false, true, true, true,
false, false, false, true, false, true, false, true, false, true})
+ }
+}
+
+func TestDoesNotReadOutOfBounds(t *testing.T) {
+ var bitmap [16]byte
+ const length = 128
+
+ reader := bitutil.NewBitmapReader(bitmap[:], 0, length)
+ assert.EqualValues(t, length, reader.Len())
+ assert.NotPanics(t, func() {
+ for i := 0; i < length; i++ {
+ assert.True(t, reader.NotSet())
+ reader.Next()
+ }
+ })
+ assert.EqualValues(t, length, reader.Pos())
+
+ reader = bitutil.NewBitmapReader(bitmap[:], 5, length-5)
+ assert.EqualValues(t, length-5, reader.Len())
+ assert.NotPanics(t, func() {
+ for i := 0; i < length-5; i++ {
+ assert.True(t, reader.NotSet())
+ reader.Next()
+ }
+ })
+ assert.EqualValues(t, length-5, reader.Pos())
+
+ assert.NotPanics(t, func() {
+ reader = bitutil.NewBitmapReader(nil, 0, 0)
+ })
+}
+
+func writeToWriter(vals []int, wr *bitutil.BitmapWriter) {
+ for _, v := range vals {
+ if v != 0 {
+ wr.Set()
+ } else {
+ wr.Clear()
+ }
+ wr.Next()
+ }
+ wr.Finish()
+}
+
+func TestBitmapWriter(t *testing.T) {
+ for _, fillByte := range []byte{0x00, 0xFF} {
+ {
+ bitmap := []byte{fillByte, fillByte, fillByte, fillByte}
+ wr := bitutil.NewBitmapWriter(bitmap, 0, 12)
+ writeToWriter([]int{0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
1}, wr)
+ // {0b00110110, 0b....1010, ........, ........}
+ assert.Equal(t, []byte{0x36, (0x0A | (fillByte &
0xF0)), fillByte, fillByte}, bitmap)
+ }
+ {
+ bitmap := []byte{fillByte, fillByte, fillByte, fillByte}
+ wr := bitutil.NewBitmapWriter(bitmap, 3, 12)
+ writeToWriter([]int{0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
1}, wr)
+ // {0b10110..., 0b.1010001, ........, ........}
+ assert.Equal(t, []byte{0xb0 | (fillByte & 0x07), 0x51 |
(fillByte & 0x80), fillByte, fillByte}, bitmap)
+ }
+ {
+ bitmap := []byte{fillByte, fillByte, fillByte, fillByte}
+ wr := bitutil.NewBitmapWriter(bitmap, 20, 12)
+ writeToWriter([]int{0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
1}, wr)
+ // {........, ........, 0b0110...., 0b10100011}
+ assert.Equal(t, []byte{fillByte, fillByte, 0x60 |
(fillByte & 0x0f), 0xa3}, bitmap)
+ }
+ }
+}
+
+func TestBitmapReader(t *testing.T) {
+ assertReaderVals := func(vals []int, rdr *bitutil.BitmapReader) {
+ for _, v := range vals {
+ if v != 0 {
+ assert.True(t, rdr.Set())
+ assert.False(t, rdr.NotSet())
+ } else {
+ assert.False(t, rdr.Set())
+ assert.True(t, rdr.NotSet())
+ }
+ rdr.Next()
+ }
+ }
+
+ vals := []int{0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1}
+
+ for _, offset := range []int{0, 1, 3, 5, 7, 8, 12, 13, 21, 38, 75, 120}
{
+ bm := make([]byte,
bitutil.BytesForBits(int64(len(vals)+offset)))
+ wr := bitutil.NewBitmapWriter(bm, offset, len(vals))
+ writeToWriter(vals, wr)
+
+ rdr := bitutil.NewBitmapReader(bm, offset, 14)
+ assertReaderVals(vals, rdr)
+ }
+}
+
+func TestCopyBitmap(t *testing.T) {
+ const bufsize = 1000
+ lengths := []int{bufsize*8 - 4, bufsize * 8}
+ offsets := []int{0, 12, 16, 32, 37, 63, 64, 128}
+
+ buffer := make([]byte, bufsize)
+
+ // random bytes
+ r := rand.New(rand.NewSource(0))
+ r.Read(buffer)
+
+ // add 16 byte padding
+ otherBuffer := make([]byte, bufsize+32)
+ r.Read(otherBuffer)
+
+ for _, nbits := range lengths {
+ for _, offset := range offsets {
+ for _, destOffset := range offsets {
+ t.Run(fmt.Sprintf("bits %d off %d dst %d",
nbits, offset, destOffset), func(t *testing.T) {
+ copyLen := nbits - offset
+
+ bmCopy := make([]byte, len(otherBuffer))
+ copy(bmCopy, otherBuffer)
+
+ bitutil.CopyBitmap(buffer, offset,
copyLen, bmCopy, destOffset)
+
+ for i := 0; i < int(destOffset); i++ {
+ assert.Equalf(t,
bitutil.BitIsSet(otherBuffer, i), bitutil.BitIsSet(bmCopy, i), "bit index: %d",
i)
+ }
+ for i := 0; i < int(copyLen); i++ {
+ assert.Equalf(t,
bitutil.BitIsSet(buffer, i+int(offset)), bitutil.BitIsSet(bmCopy,
i+int(destOffset)), "bit index: %d", i)
+ }
+ for i := int(destOffset + copyLen); i <
len(otherBuffer); i++ {
+ assert.Equalf(t,
bitutil.BitIsSet(otherBuffer, i), bitutil.BitIsSet(bmCopy, i), "bit index: %d",
i)
+ }
+ })
+ }
+ }
+ }
+}
+
+func benchmarkCopyBitmapN(b *testing.B, offsetSrc, offsetDest, n int) {
+ nbits := n * 8
+ // random bytes
+ r := rand.New(rand.NewSource(0))
+ src := make([]byte, n)
+ r.Read(src)
+
+ length := nbits - offsetSrc
+
+ dest := make([]byte, bitutil.BytesForBits(int64(length+offsetDest)))
+
+ b.ResetTimer()
+ b.SetBytes(int64(n))
+ for i := 0; i < b.N; i++ {
+ bitutil.CopyBitmap(src, offsetSrc, length, dest, offsetDest)
+ }
+}
+
+// Fast path which is just a memcopy
+func BenchmarkCopyBitmapWithoutOffset(b *testing.B) {
+ for _, sz := range []int{32, 128, 1000, 1024} {
+ b.Run(strconv.Itoa(sz), func(b *testing.B) {
+ benchmarkCopyBitmapN(b, 0, 0, sz)
+ })
+ }
+}
+
+// slow path where the source buffer is not byte aligned
+func BenchmarkCopyBitmapWithOffset(b *testing.B) {
+ for _, sz := range []int{32, 128, 1000, 1024} {
+ b.Run(strconv.Itoa(sz), func(b *testing.B) {
+ benchmarkCopyBitmapN(b, 4, 0, sz)
+ })
+ }
+}
+
+// slow path where both source and dest are not byte aligned
+func BenchmarkCopyBitmapWithOffsetBoth(b *testing.B) {
+ for _, sz := range []int{32, 128, 1000, 1024} {
+ b.Run(strconv.Itoa(sz), func(b *testing.B) {
+ benchmarkCopyBitmapN(b, 3, 7, sz)
+ })
+ }
+}
diff --git a/go/arrow/bitutil/bitutil.go b/go/arrow/bitutil/bitutil.go
index a751674..02a4170 100644
--- a/go/arrow/bitutil/bitutil.go
+++ b/go/arrow/bitutil/bitutil.go
@@ -17,9 +17,12 @@
package bitutil
import (
+ "math"
"math/bits"
"reflect"
"unsafe"
+
+ "github.com/apache/arrow/go/arrow/memory"
)
var (
@@ -157,3 +160,61 @@ func bytesToUint64(b []byte) []uint64 {
return res
}
+
+var (
+ // PrecedingBitmask is a convenience set of values as bitmasks for
checking
+ // prefix bits of a byte
+ PrecedingBitmask = [8]byte{0, 1, 3, 7, 15, 31, 63, 127}
+ // TrailingBitmask is the bitwise complement version of
kPrecedingBitmask
+ TrailingBitmask = [8]byte{255, 254, 252, 248, 240, 224, 192, 128}
+)
+
+// SetBitsTo is a convenience function to quickly set or unset all the bits
+// in a bitmap starting at startOffset for length bits.
+func SetBitsTo(bits []byte, startOffset, length int64, areSet bool) {
+ if length == 0 {
+ return
+ }
+
+ beg := startOffset
+ end := startOffset + length
+ var fill uint8 = 0
+ if areSet {
+ fill = math.MaxUint8
+ }
+
+ byteBeg := beg / 8
+ byteEnd := end/8 + 1
+
+ // don't modify bits before the startOffset by using this mask
+ firstByteMask := PrecedingBitmask[beg%8]
+ // don't modify bits past the length by using this mask
+ lastByteMask := TrailingBitmask[end%8]
+
+ if byteEnd == byteBeg+1 {
+ // set bits within a single byte
+ onlyByteMask := firstByteMask
+ if end%8 != 0 {
+ onlyByteMask = firstByteMask | lastByteMask
+ }
+
+ bits[byteBeg] &= onlyByteMask
+ bits[byteBeg] |= fill &^ onlyByteMask
+ return
+ }
+
+ // set/clear trailing bits of first byte
+ bits[byteBeg] &= firstByteMask
+ bits[byteBeg] |= fill &^ firstByteMask
+
+ if byteEnd-byteBeg > 2 {
+ memory.Set(bits[byteBeg+1:byteEnd-1], fill)
+ }
+
+ if end%8 == 0 {
+ return
+ }
+
+ bits[byteEnd-1] &= lastByteMask
+ bits[byteEnd-1] |= fill &^ lastByteMask
+}
diff --git a/go/arrow/bitutil/bitutil_test.go b/go/arrow/bitutil/bitutil_test.go
index fcb362f..0a4b725 100644
--- a/go/arrow/bitutil/bitutil_test.go
+++ b/go/arrow/bitutil/bitutil_test.go
@@ -197,6 +197,39 @@ func TestCountSetBitsOffset(t *testing.T) {
}
}
+func TestSetBitsTo(t *testing.T) {
+ for _, fillByte := range []byte{0x00, 0xFF} {
+ {
+ // set within a byte
+ bm := []byte{fillByte, fillByte, fillByte, fillByte}
+ bitutil.SetBitsTo(bm, 2, 2, true)
+ bitutil.SetBitsTo(bm, 4, 2, false)
+ assert.Equal(t, []byte{(fillByte &^ 0x3C) | 0xC},
bm[:1])
+ }
+ {
+ // test straddling a single byte boundary
+ bm := []byte{fillByte, fillByte, fillByte, fillByte}
+ bitutil.SetBitsTo(bm, 4, 7, true)
+ bitutil.SetBitsTo(bm, 11, 7, false)
+ assert.Equal(t, []byte{(fillByte & 0xF) | 0xF0, 0x7,
fillByte &^ 0x3}, bm[:3])
+ }
+ {
+ // test byte aligned end
+ bm := []byte{fillByte, fillByte, fillByte, fillByte}
+ bitutil.SetBitsTo(bm, 4, 4, true)
+ bitutil.SetBitsTo(bm, 8, 8, false)
+ assert.Equal(t, []byte{(fillByte & 0xF) | 0xF0, 0x00,
fillByte}, bm[:3])
+ }
+ {
+ // test byte aligned end, multiple bytes
+ bm := []byte{fillByte, fillByte, fillByte, fillByte}
+ bitutil.SetBitsTo(bm, 0, 24, false)
+ falseByte := byte(0)
+ assert.Equal(t, []byte{falseByte, falseByte, falseByte,
fillByte}, bm)
+ }
+ }
+}
+
func bbits(v ...int32) []byte {
return tools.IntsToBitsLSB(v...)
}
diff --git a/go/arrow/go.sum b/go/arrow/go.sum
index 0ac57ba..24da3ea 100644
--- a/go/arrow/go.sum
+++ b/go/arrow/go.sum
@@ -56,7 +56,6 @@ github.com/pmezard/go-difflib v1.0.0
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod
h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/rogpeppe/fastuuid v1.2.0/go.mod
h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
-github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
github.com/stretchr/objx v0.1.0/go.mod
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.5.1/go.mod
h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/stretchr/testify v1.7.0
h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
@@ -70,10 +69,8 @@ golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod
h1:CJ0aWSM057203Lf6IL
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod
h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod
h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod
h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
-golang.org/x/lint v0.0.0-20210508222113-6edffad5e616
h1:VLliZ0d+/avPrXXH+OakdXhpJuEoBZuwh1m2j7U6Iug=
golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod
h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod
h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
-golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -116,7 +113,6 @@ golang.org/x/tools
v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod
h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod
h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod
h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
-golang.org/x/tools v0.1.4 h1:cVngSRcfgyZCzys3KYOpCFa+4dqX/Oub9tAq00ttGVs=
golang.org/x/tools v0.1.4/go.mod
h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=