zeroshade commented on code in PR #1053:
URL: https://github.com/apache/iceberg-go/pull/1053#discussion_r3275019424
##########
table/dv/roaring_bitmap.go:
##########
@@ -160,7 +161,59 @@ func DeserializeRoaringPositionBitmap(data []byte)
(*RoaringPositionBitmap, erro
return b, nil
}
-// sortedKeys returns the bitmap keys in ascending order.
-func (b *RoaringPositionBitmap) sortedKeys() []uint32 {
- return slices.Sorted(maps.Keys(b.bitmaps))
+// KeepMaskBytes returns a bit-packed []byte of length ⌈length/8⌉ where bit i
+// (LSB-first within a byte) is 1 iff position i is NOT in the bitmap. The
+// layout matches Arrow Boolean buffer convention so callers can wrap the
+// result via memory.NewBufferBytes / array.NewBoolean without re-packing.
+//
+// length bounds the range of positions the caller cares about — typically the
+// data file's row count. Bits past length-1 in the final byte are cleared.
+// Positions in the bitmap that fall outside [0, length) are ignored, so a
+// caller can safely pass a length smaller than the bitmap's max position
+// (e.g. when the file row count is below a stale upper bound).
+//
+// Bucket-key arithmetic is exact: each 32-bit bucket covers exactly 2^32
+// positions, i.e. 2^29 bytes, so per-bucket byte offsets are always
+// 8-byte-aligned and the inverted dense words land cleanly without bit shifts.
+func (b *RoaringPositionBitmap) KeepMaskBytes(length int64) []byte {
+ if length <= 0 {
+ return nil
+ }
+ out := make([]byte, (length+7)/8)
+ for i := range out {
+ out[i] = 0xFF
+ }
+ // Clear bits past length-1 in the trailing byte so the mask is exactly
+ // `length` bits wide — Arrow's Boolean array honors the length field
but
+ // callers inspecting the raw bytes shouldn't see phantom keep bits.
+ if extra := length & 7; extra != 0 {
+ out[len(out)-1] &= byte(1<<extra - 1)
+ }
Review Comment:
use
https://pkg.go.dev/github.com/apache/arrow-go/[email protected]/arrow/bitutil#SetBitsTo
?
##########
table/dv/roaring_bitmap.go:
##########
@@ -160,7 +161,59 @@ func DeserializeRoaringPositionBitmap(data []byte)
(*RoaringPositionBitmap, erro
return b, nil
}
-// sortedKeys returns the bitmap keys in ascending order.
-func (b *RoaringPositionBitmap) sortedKeys() []uint32 {
- return slices.Sorted(maps.Keys(b.bitmaps))
+// KeepMaskBytes returns a bit-packed []byte of length ⌈length/8⌉ where bit i
+// (LSB-first within a byte) is 1 iff position i is NOT in the bitmap. The
+// layout matches Arrow Boolean buffer convention so callers can wrap the
+// result via memory.NewBufferBytes / array.NewBoolean without re-packing.
+//
+// length bounds the range of positions the caller cares about — typically the
+// data file's row count. Bits past length-1 in the final byte are cleared.
+// Positions in the bitmap that fall outside [0, length) are ignored, so a
+// caller can safely pass a length smaller than the bitmap's max position
+// (e.g. when the file row count is below a stale upper bound).
+//
+// Bucket-key arithmetic is exact: each 32-bit bucket covers exactly 2^32
+// positions, i.e. 2^29 bytes, so per-bucket byte offsets are always
+// 8-byte-aligned and the inverted dense words land cleanly without bit shifts.
+func (b *RoaringPositionBitmap) KeepMaskBytes(length int64) []byte {
+ if length <= 0 {
+ return nil
+ }
+ out := make([]byte, (length+7)/8)
Review Comment:
use
https://pkg.go.dev/github.com/apache/arrow-go/[email protected]/arrow/bitutil#BytesForBits
:smile:
##########
table/dv/roaring_bitmap.go:
##########
@@ -160,7 +161,59 @@ func DeserializeRoaringPositionBitmap(data []byte)
(*RoaringPositionBitmap, erro
return b, nil
}
-// sortedKeys returns the bitmap keys in ascending order.
-func (b *RoaringPositionBitmap) sortedKeys() []uint32 {
- return slices.Sorted(maps.Keys(b.bitmaps))
+// KeepMaskBytes returns a bit-packed []byte of length ⌈length/8⌉ where bit i
+// (LSB-first within a byte) is 1 iff position i is NOT in the bitmap. The
+// layout matches Arrow Boolean buffer convention so callers can wrap the
+// result via memory.NewBufferBytes / array.NewBoolean without re-packing.
+//
+// length bounds the range of positions the caller cares about — typically the
+// data file's row count. Bits past length-1 in the final byte are cleared.
+// Positions in the bitmap that fall outside [0, length) are ignored, so a
+// caller can safely pass a length smaller than the bitmap's max position
+// (e.g. when the file row count is below a stale upper bound).
+//
+// Bucket-key arithmetic is exact: each 32-bit bucket covers exactly 2^32
+// positions, i.e. 2^29 bytes, so per-bucket byte offsets are always
+// 8-byte-aligned and the inverted dense words land cleanly without bit shifts.
+func (b *RoaringPositionBitmap) KeepMaskBytes(length int64) []byte {
+ if length <= 0 {
+ return nil
+ }
+ out := make([]byte, (length+7)/8)
+ for i := range out {
+ out[i] = 0xFF
+ }
Review Comment:
use
https://pkg.go.dev/github.com/apache/arrow-go/[email protected]/arrow/memory#Set
which will leverage SIMD for this :smile:
##########
table/dv/roaring_bitmap.go:
##########
@@ -160,7 +161,59 @@ func DeserializeRoaringPositionBitmap(data []byte)
(*RoaringPositionBitmap, erro
return b, nil
}
-// sortedKeys returns the bitmap keys in ascending order.
-func (b *RoaringPositionBitmap) sortedKeys() []uint32 {
- return slices.Sorted(maps.Keys(b.bitmaps))
+// KeepMaskBytes returns a bit-packed []byte of length ⌈length/8⌉ where bit i
+// (LSB-first within a byte) is 1 iff position i is NOT in the bitmap. The
+// layout matches Arrow Boolean buffer convention so callers can wrap the
+// result via memory.NewBufferBytes / array.NewBoolean without re-packing.
+//
+// length bounds the range of positions the caller cares about — typically the
+// data file's row count. Bits past length-1 in the final byte are cleared.
+// Positions in the bitmap that fall outside [0, length) are ignored, so a
+// caller can safely pass a length smaller than the bitmap's max position
+// (e.g. when the file row count is below a stale upper bound).
+//
+// Bucket-key arithmetic is exact: each 32-bit bucket covers exactly 2^32
+// positions, i.e. 2^29 bytes, so per-bucket byte offsets are always
+// 8-byte-aligned and the inverted dense words land cleanly without bit shifts.
+func (b *RoaringPositionBitmap) KeepMaskBytes(length int64) []byte {
+ if length <= 0 {
+ return nil
+ }
+ out := make([]byte, (length+7)/8)
+ for i := range out {
+ out[i] = 0xFF
+ }
+ // Clear bits past length-1 in the trailing byte so the mask is exactly
+ // `length` bits wide — Arrow's Boolean array honors the length field
but
+ // callers inspecting the raw bytes shouldn't see phantom keep bits.
+ if extra := length & 7; extra != 0 {
+ out[len(out)-1] &= byte(1<<extra - 1)
+ }
+
+ for key, bm := range b.bitmaps {
+ bucketByteBase := int64(key) << 29 // (key << 32) bits / 8
+ if bucketByteBase >= int64(len(out)) {
+ continue
+ }
+ dense := bm.ToDense() // []uint64, LE bit-packed
Review Comment:
is it actually LE bit-packed? The docs of the roaring bitmap package doesn't
say that, so I would assume it's native-endian
##########
table/dv/roaring_bitmap.go:
##########
@@ -160,7 +161,59 @@ func DeserializeRoaringPositionBitmap(data []byte)
(*RoaringPositionBitmap, erro
return b, nil
}
-// sortedKeys returns the bitmap keys in ascending order.
-func (b *RoaringPositionBitmap) sortedKeys() []uint32 {
- return slices.Sorted(maps.Keys(b.bitmaps))
+// KeepMaskBytes returns a bit-packed []byte of length ⌈length/8⌉ where bit i
+// (LSB-first within a byte) is 1 iff position i is NOT in the bitmap. The
+// layout matches Arrow Boolean buffer convention so callers can wrap the
+// result via memory.NewBufferBytes / array.NewBoolean without re-packing.
+//
+// length bounds the range of positions the caller cares about — typically the
+// data file's row count. Bits past length-1 in the final byte are cleared.
+// Positions in the bitmap that fall outside [0, length) are ignored, so a
+// caller can safely pass a length smaller than the bitmap's max position
+// (e.g. when the file row count is below a stale upper bound).
+//
+// Bucket-key arithmetic is exact: each 32-bit bucket covers exactly 2^32
+// positions, i.e. 2^29 bytes, so per-bucket byte offsets are always
+// 8-byte-aligned and the inverted dense words land cleanly without bit shifts.
+func (b *RoaringPositionBitmap) KeepMaskBytes(length int64) []byte {
+ if length <= 0 {
+ return nil
+ }
+ out := make([]byte, (length+7)/8)
+ for i := range out {
+ out[i] = 0xFF
+ }
+ // Clear bits past length-1 in the trailing byte so the mask is exactly
+ // `length` bits wide — Arrow's Boolean array honors the length field
but
+ // callers inspecting the raw bytes shouldn't see phantom keep bits.
+ if extra := length & 7; extra != 0 {
+ out[len(out)-1] &= byte(1<<extra - 1)
+ }
+
+ for key, bm := range b.bitmaps {
+ bucketByteBase := int64(key) << 29 // (key << 32) bits / 8
+ if bucketByteBase >= int64(len(out)) {
+ continue
+ }
+ dense := bm.ToDense() // []uint64, LE bit-packed
+ for i, word := range dense {
Review Comment:
Could this be simplified by using
https://github.com/apache/arrow-go/blob/v18.6.0/arrow/bitutil/bitmaps.go#L330
and `PutNextWord(^word)` ?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]