This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new 774c8add feat: Deletion vector reader (#866)
774c8add is described below
commit 774c8addea96522897e161216ebd456c96165406
Author: Shreyas Mishra <[email protected]>
AuthorDate: Sat May 9 01:30:31 2026 +0530
feat: Deletion vector reader (#866)
Signed-off-by: Shreyas220 <[email protected]>
---
go.mod | 3 +
go.sum | 6 +
manifest.go | 16 +-
table/dv/deletion_vector.go | 132 +++++++
table/dv/deletion_vector_test.go | 382 +++++++++++++++++++++
table/dv/roaring_bitmap.go | 166 +++++++++
table/dv/roaring_bitmap_test.go | 219 ++++++++++++
table/dv/testdata/deletes/64map32bitvals.bin | Bin 0 -> 48 bytes
table/dv/testdata/deletes/64mapempty.bin | Bin 0 -> 8 bytes
table/dv/testdata/deletes/64mapspreadvals.bin | Bin 0 -> 408 bytes
table/dv/testdata/deletes/README.md | 21 ++
.../deletes/all-container-types-position-index.bin | Bin 0 -> 94 bytes
table/dv/testdata/deletes/empty-position-index.bin | Bin 0 -> 20 bytes
.../small-alternating-values-position-index.bin | Bin 0 -> 50 bytes
.../small-and-large-values-position-index.bin | Bin 0 -> 56 bytes
table/dv_scan_planning_test.go | 3 +-
16 files changed, 944 insertions(+), 4 deletions(-)
diff --git a/go.mod b/go.mod
index ccbcc823..1f94eaf8 100644
--- a/go.mod
+++ b/go.mod
@@ -23,6 +23,7 @@ require (
cloud.google.com/go/storage v1.62.1
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1
github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4
+ github.com/RoaringBitmap/roaring/v2 v2.16.0
github.com/alexflint/go-arg v1.6.1
github.com/apache/arrow-go/v18 v18.6.0
github.com/aws/aws-sdk-go-v2 v1.41.6
@@ -104,6 +105,7 @@ require (
github.com/aws/aws-sdk-go-v2/service/sts v1.42.0 // indirect
github.com/beltran/gosasl v1.0.0 // indirect
github.com/beltran/gssapi v0.0.0-20200324152954-d86554db4bab // indirect
+ github.com/bits-and-blooms/bitset v1.24.2 // indirect
github.com/buger/goterm v1.0.4 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
@@ -197,6 +199,7 @@ require (
github.com/moby/sys/userns v0.1.0 // indirect
github.com/moby/term v0.5.2 // indirect
github.com/morikuni/aec v1.1.0 // indirect
+ github.com/mschoch/smat v0.2.0 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.1 // indirect
diff --git a/go.sum b/go.sum
index ecc67860..7735421d 100644
--- a/go.sum
+++ b/go.sum
@@ -81,6 +81,8 @@ github.com/Microsoft/hcsshim v0.14.0-rc.1
h1:qAPXKwGOkVn8LlqgBN8GS0bxZ83hOJpcjxz
github.com/Microsoft/hcsshim v0.14.0-rc.1/go.mod
h1:hTKFGbnDtQb1wHiOWv4v0eN+7boSWAHyK/tNAaYZL0c=
github.com/ProtonMail/go-crypto v1.3.0
h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw=
github.com/ProtonMail/go-crypto v1.3.0/go.mod
h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE=
+github.com/RoaringBitmap/roaring/v2 v2.16.0
h1:Kys1UNf49d5W8Tq3bpuAhIr/Z8/yPB+59CO8A6c/BbE=
+github.com/RoaringBitmap/roaring/v2 v2.16.0/go.mod
h1:eq4wdNXxtJIS/oikeCzdX1rBzek7ANzbth041hrU8Q4=
github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d
h1:licZJFw2RwpHMqeKTCYkitsPqHNxTmd4SNR5r94FGM8=
github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d/go.mod
h1:asat636LX7Bqt5lYEZ27JNDcqxfjdBQuJ/MM4CN/Lzo=
github.com/alexflint/go-arg v1.6.1
h1:uZogJ6VDBjcuosydKgvYYRhh9sRCusjOvoOLZopBlnA=
@@ -150,6 +152,8 @@ github.com/beltran/gssapi
v0.0.0-20200324152954-d86554db4bab h1:ayfcn60tXOSYy5zU
github.com/beltran/gssapi v0.0.0-20200324152954-d86554db4bab/go.mod
h1:GLe4UoSyvJ3cVG+DVtKen5eAiaD8mAJFuV5PT3Eeg9Q=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/bits-and-blooms/bitset v1.24.2
h1:M7/NzVbsytmtfHbumG+K2bremQPMJuqv1JD3vOaFxp0=
+github.com/bits-and-blooms/bitset v1.24.2/go.mod
h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blang/semver v3.5.1+incompatible
h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ=
github.com/blang/semver v3.5.1+incompatible/go.mod
h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/buger/goterm v1.0.4 h1:Z9YvGmOih81P0FbVtEYTFF6YsSgxSUKEhf/f9bTMXbY=
@@ -475,6 +479,8 @@ github.com/moby/term v0.5.2
h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
github.com/moby/term v0.5.2/go.mod
h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
github.com/morikuni/aec v1.1.0 h1:vBBl0pUnvi/Je71dsRrhMBtreIqNMYErSAbEeb8jrXQ=
github.com/morikuni/aec v1.1.0/go.mod
h1:xDRgiq/iw5l+zkao76YTKzKttOp2cwPEne25HDkJnBw=
+github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
+github.com/mschoch/smat v0.2.0/go.mod
h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822
h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod
h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/ncruces/go-strftime v1.0.0
h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
diff --git a/manifest.go b/manifest.go
index 3de9c128..2efefe8a 100644
--- a/manifest.go
+++ b/manifest.go
@@ -1646,6 +1646,7 @@ const (
AvroFile FileFormat = "AVRO"
OrcFile FileFormat = "ORC"
ParquetFile FileFormat = "PARQUET"
+ PuffinFile FileFormat = "PUFFIN"
)
// FileFormatFromString parses a file format string (case-insensitive).
@@ -1657,6 +1658,8 @@ func FileFormatFromString(s string) (FileFormat, error) {
return OrcFile, nil
case string(AvroFile):
return AvroFile, nil
+ case string(PuffinFile):
+ return PuffinFile, nil
default:
return "", fmt.Errorf("unknown file format: %s", s)
}
@@ -2152,10 +2155,17 @@ func NewDataFileBuilder(
return nil, fmt.Errorf("%w: path cannot be empty",
ErrInvalidArgument)
}
- if format != AvroFile && format != OrcFile && format != ParquetFile {
+ if format != AvroFile && format != OrcFile && format != ParquetFile &&
format != PuffinFile {
return nil, fmt.Errorf(
- "%w: format must be one of %s, %s, or %s",
- ErrInvalidArgument, AvroFile, OrcFile, ParquetFile,
+ "%w: format must be one of %s, %s, %s, or %s",
+ ErrInvalidArgument, AvroFile, OrcFile, ParquetFile,
PuffinFile,
+ )
+ }
+
+ if format == PuffinFile && content != EntryContentPosDeletes {
+ return nil, fmt.Errorf(
+ "%w: %s format is only valid for %s content",
+ ErrInvalidArgument, PuffinFile, EntryContentPosDeletes,
)
}
diff --git a/table/dv/deletion_vector.go b/table/dv/deletion_vector.go
new file mode 100644
index 00000000..2b618c89
--- /dev/null
+++ b/table/dv/deletion_vector.go
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+ "encoding/binary"
+ "fmt"
+ "hash/crc32"
+
+ "github.com/apache/iceberg-go"
+ iceio "github.com/apache/iceberg-go/io"
+ "github.com/apache/iceberg-go/puffin"
+)
+
+const (
+	// DVMagicNumber is the magic number for deletion vectors.
+	// The spec byte sequence D1 D3 39 64, decoded as a little-endian
+	// uint32, yields 0x6439D3D1.
+	DVMagicNumber uint32 = 0x6439D3D1
+
+	dvLengthSize = 4 // length field
+	dvMagicSize  = 4 // magic field
+	dvCRCSize    = 4 // CRC-32 checksum
+	dvMinSize    = dvLengthSize + dvMagicSize + dvCRCSize
+)
+
+// DeserializeDV parses a deletion vector blob and returns a bitmap of deleted
+// positions.
+//
+// The DV binary format is:
+//   - Length (4 bytes, big-endian): size of magic + bitmap data, excluding CRC-32
+//   - Magic (4 bytes, little-endian): must be 0x6439D3D1
+//   - Bitmap (variable): roaring bitmap in Iceberg portable format
+//   - CRC-32 (4 bytes, big-endian): checksum over magic + bitmap
+//
+// If expectedCardinality >= 0, the bitmap's cardinality is validated against
+// it; pass a negative value to skip that check.
+//
+// An error is returned when the payload is shorter than the fixed framing,
+// when the length, magic or CRC fields do not match the payload, when the
+// inner bitmap cannot be decoded, or when the cardinality check fails.
+func DeserializeDV(data []byte, expectedCardinality int64) (*RoaringPositionBitmap, error) {
+	if len(data) < dvMinSize {
+		return nil, fmt.Errorf("deletion vector payload too short: %d bytes (minimum %d)", len(data), dvMinSize)
+	}
+
+	// 1. Read and validate length. The comparison is done in int64 so that a
+	// payload larger than 4 GiB cannot wrap around and accidentally match
+	// the 32-bit length field.
+	length := binary.BigEndian.Uint32(data[0:dvLengthSize])
+	expectedLength := int64(len(data)) - dvLengthSize - dvCRCSize
+	if int64(length) != expectedLength {
+		return nil, fmt.Errorf("deletion vector length mismatch: got %d, expected %d", length, expectedLength)
+	}
+
+	// 2. Read and validate magic
+	magic := binary.LittleEndian.Uint32(data[dvLengthSize : dvLengthSize+dvMagicSize])
+	if magic != DVMagicNumber {
+		return nil, fmt.Errorf("invalid deletion vector magic: 0x%08x, expected 0x%08x", magic, DVMagicNumber)
+	}
+
+	// 3. Verify CRC-32 over magic + bitmap (bytes 4 to len-4)
+	bitmapDataStart := dvLengthSize
+	bitmapDataEnd := len(data) - dvCRCSize
+	computedCRC := crc32.ChecksumIEEE(data[bitmapDataStart:bitmapDataEnd])
+	expectedCRC := binary.BigEndian.Uint32(data[bitmapDataEnd:])
+	if computedCRC != expectedCRC {
+		return nil, fmt.Errorf("deletion vector CRC mismatch: computed 0x%08x, expected 0x%08x", computedCRC, expectedCRC)
+	}
+
+	// 4. Deserialize roaring bitmap from the inner bytes (after length +
+	// magic, before CRC)
+	roaringStart := dvLengthSize + dvMagicSize
+	bitmap, err := DeserializeRoaringPositionBitmap(data[roaringStart:bitmapDataEnd])
+	if err != nil {
+		return nil, fmt.Errorf("deserialize deletion vector bitmap: %w", err)
+	}
+
+	// 5. Validate cardinality if requested
+	if expectedCardinality >= 0 {
+		actual := bitmap.Cardinality()
+		if actual != expectedCardinality {
+			return nil, fmt.Errorf("deletion vector cardinality mismatch: got %d, expected %d", actual, expectedCardinality)
+		}
+	}
+
+	return bitmap, nil
+}
+
+// ReadDV reads a deletion vector from a puffin file using the manifest entry
+// metadata. ContentOffset and ContentSizeInBytes must be set on the DataFile
+// (required by v3 spec).
+//
+// The file format, blob offset and blob size are all validated before any
+// I/O is performed. The size must be large enough to hold the fixed DV
+// framing and no larger than puffin.DefaultMaxBlobSize; the offset must be
+// non-negative. Errors from opening the file, parsing the puffin container,
+// reading the blob range and decoding the DV are wrapped with context.
+func ReadDV(fs iceio.IO, dvFile iceberg.DataFile) (*RoaringPositionBitmap, error) {
+	if dvFile.FileFormat() != iceberg.PuffinFile {
+		return nil, fmt.Errorf("expected PUFFIN format for deletion vector, got %s", dvFile.FileFormat())
+	}
+
+	if dvFile.ContentOffset() == nil || dvFile.ContentSizeInBytes() == nil {
+		return nil, fmt.Errorf("DV file %s missing ContentOffset/ContentSizeInBytes", dvFile.FilePath())
+	}
+
+	size := *dvFile.ContentSizeInBytes()
+	if size < 0 || size > int64(puffin.DefaultMaxBlobSize) {
+		return nil, fmt.Errorf("DV blob size %d out of valid range [0, %d]", size, puffin.DefaultMaxBlobSize)
+	}
+	// A blob that cannot even hold length + magic + CRC can never decode,
+	// so reject it here instead of after opening and parsing the file.
+	if size < dvMinSize {
+		return nil, fmt.Errorf("DV blob size %d is smaller than the minimum deletion vector size %d", size, dvMinSize)
+	}
+
+	offset := *dvFile.ContentOffset()
+	if offset < 0 {
+		return nil, fmt.Errorf("DV blob offset %d must be non-negative", offset)
+	}
+
+	f, err := fs.Open(dvFile.FilePath())
+	if err != nil {
+		return nil, fmt.Errorf("open DV file %s: %w", dvFile.FilePath(), err)
+	}
+	defer f.Close()
+
+	reader, err := puffin.NewReader(f)
+	if err != nil {
+		return nil, fmt.Errorf("create puffin reader for %s: %w", dvFile.FilePath(), err)
+	}
+
+	blobData := make([]byte, size)
+	if _, err := reader.ReadAt(blobData, offset); err != nil {
+		return nil, fmt.Errorf("read DV blob at offset %d: %w", offset, err)
+	}
+
+	// Pass -1 to skip cardinality validation during deserialization.
+	// dvFile.Count() defaults to 0 when unset, which would incorrectly
+	// reject valid DVs. Callers can validate cardinality separately.
+	return DeserializeDV(blobData, -1)
+}
diff --git a/table/dv/deletion_vector_test.go b/table/dv/deletion_vector_test.go
new file mode 100644
index 00000000..1ea87142
--- /dev/null
+++ b/table/dv/deletion_vector_test.go
@@ -0,0 +1,382 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+ "encoding/binary"
+ "errors"
+ "hash/crc32"
+ "os"
+ "path/filepath"
+ "testing"
+
+ "github.com/apache/iceberg-go"
+ iceio "github.com/apache/iceberg-go/io"
+ "github.com/apache/iceberg-go/puffin"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// mockDVFile is a minimal iceberg.DataFile implementation with only the
+// fields exercised by the DV reader: path, format, count, and the DV-specific
+// offset/size/referenced-data-file pointers.
+//
+// Accessors with an m receiver return the stored field; the remaining
+// accessors return fixed zero values, except ContentType, which always
+// reports position deletes.
+type mockDVFile struct {
+	path               string
+	format             iceberg.FileFormat
+	count              int64
+	referencedDataFile *string
+	contentOffset      *int64
+	contentSizeInBytes *int64
+}
+
+func (m *mockDVFile) FilePath() string                        { return m.path }
+func (m *mockDVFile) FileFormat() iceberg.FileFormat          { return m.format }
+func (m *mockDVFile) Count() int64                            { return m.count }
+func (m *mockDVFile) ReferencedDataFile() *string             { return m.referencedDataFile }
+func (m *mockDVFile) ContentOffset() *int64                   { return m.contentOffset }
+func (m *mockDVFile) ContentSizeInBytes() *int64              { return m.contentSizeInBytes }
+func (*mockDVFile) ContentType() iceberg.ManifestEntryContent { return iceberg.EntryContentPosDeletes }
+func (*mockDVFile) Partition() map[int]any                    { return nil }
+func (*mockDVFile) FileSizeBytes() int64                      { return 0 }
+func (*mockDVFile) ColumnSizes() map[int]int64                { return nil }
+func (*mockDVFile) ValueCounts() map[int]int64                { return nil }
+func (*mockDVFile) NullValueCounts() map[int]int64            { return nil }
+func (*mockDVFile) NaNValueCounts() map[int]int64             { return nil }
+func (*mockDVFile) DistinctValueCounts() map[int]int64        { return nil }
+func (*mockDVFile) LowerBoundValues() map[int][]byte          { return nil }
+func (*mockDVFile) UpperBoundValues() map[int][]byte          { return nil }
+func (*mockDVFile) KeyMetadata() []byte                       { return nil }
+func (*mockDVFile) SplitOffsets() []int64                     { return nil }
+func (*mockDVFile) EqualityFieldIDs() []int                   { return nil }
+func (*mockDVFile) SortOrderID() *int                         { return nil }
+func (*mockDVFile) SpecID() int32                             { return 0 }
+func (*mockDVFile) FirstRowID() *int64                        { return nil }
+
+// strPtr returns a pointer to a fresh copy of s.
+func strPtr(s string) *string {
+	p := new(string)
+	*p = s
+
+	return p
+}
+
+// readDVTestData loads the named fixture from testdata/deletes and fails the
+// test immediately if the file cannot be read.
+func readDVTestData(t *testing.T, name string) []byte {
+	t.Helper()
+
+	fixture := filepath.Join("testdata", "deletes", name)
+	contents, readErr := os.ReadFile(fixture)
+	require.NoError(t, readErr)
+
+	return contents
+}
+
+// wrapDVPayloadForTest frames bitmapBytes as a deletion vector blob: a
+// big-endian length covering magic + bitmap, the little-endian magic, the
+// bitmap bytes themselves, and a big-endian CRC-32 over magic + bitmap.
+func wrapDVPayloadForTest(bitmapBytes []byte) []byte {
+	innerLen := dvMagicSize + len(bitmapBytes)
+
+	buf := make([]byte, 0, dvLengthSize+innerLen+dvCRCSize)
+	buf = binary.BigEndian.AppendUint32(buf, uint32(innerLen))
+	buf = binary.LittleEndian.AppendUint32(buf, DVMagicNumber)
+	buf = append(buf, bitmapBytes...)
+
+	// Everything after the length field so far is exactly magic + bitmap.
+	checksum := crc32.ChecksumIEEE(buf[dvLengthSize:])
+
+	return binary.BigEndian.AppendUint32(buf, checksum)
+}
+
+// writePuffinWithDVBlob writes a puffin file named test-dv.puffin into dir
+// containing dvBlobBytes as a single deletion-vector blob. It returns the
+// file path together with the blob metadata reported by the puffin writer.
+func writePuffinWithDVBlob(t *testing.T, dir string, dvBlobBytes []byte) (string, puffin.BlobMetadata) {
+	t.Helper()
+
+	puffinPath := filepath.Join(dir, "test-dv.puffin")
+	out, err := os.Create(puffinPath)
+	require.NoError(t, err)
+	defer out.Close()
+
+	writer, err := puffin.NewWriter(out)
+	require.NoError(t, err)
+
+	input := puffin.BlobMetadataInput{
+		Type:           puffin.BlobTypeDeletionVector,
+		SnapshotID:     -1,
+		SequenceNumber: -1,
+		Fields:         []int32{2147483546},
+		Properties:     map[string]string{"referenced-data-file": "s3://bucket/data/data-001.parquet"},
+	}
+	blobMeta, err := writer.AddBlob(input, dvBlobBytes)
+	require.NoError(t, err)
+	require.NoError(t, writer.Finish())
+
+	return puffinPath, blobMeta
+}
+
+// newDVTestFile builds a mockDVFile in PUFFIN format that references a fixed
+// parquet data file, using the given path, count, and offset/size pointers.
+func newDVTestFile(path string, count int64, offset, size *int64) *mockDVFile {
+	dvFile := new(mockDVFile)
+	dvFile.path = path
+	dvFile.format = iceberg.PuffinFile
+	dvFile.count = count
+	dvFile.referencedDataFile = strPtr("s3://bucket/data/data-001.parquet")
+	dvFile.contentOffset = offset
+	dvFile.contentSizeInBytes = size
+
+	return dvFile
+}
+
+// failingOpenFS is a file-system stub whose Open always fails with err and
+// whose Remove always succeeds.
+type failingOpenFS struct {
+	err error
+}
+
+// Open ignores the path and returns the configured error.
+func (f failingOpenFS) Open(string) (iceio.File, error) { return nil, f.err }
+
+// Remove ignores the path and reports success.
+func (f failingOpenFS) Remove(string) error { return nil }
+
+// Why: shows that a representative, valid DV envelope decodes successfully.
+// Condition: Java-produced DV with deleted positions [1, 3, 5, 7, 9];
+// cardinality validation is disabled.
+// Assertion: no error, cardinality is 5, sampled positions are present, and
+// a neighbouring unset position is absent.
+func TestDeserializeDV(t *testing.T) {
+	blob := readDVTestData(t, "small-alternating-values-position-index.bin")
+
+	positions, err := DeserializeDV(blob, -1)
+	require.NoError(t, err)
+
+	assert.Equal(t, int64(5), positions.Cardinality())
+	for _, deleted := range []uint64{1, 9} {
+		assert.True(t, positions.Contains(deleted))
+	}
+	assert.False(t, positions.Contains(2))
+}
+
+// Why: covers the DV envelope for the empty case, i.e. zero deleted positions.
+// Condition: Java-produced DV that contains no positions.
+// Assertion: no error, and the resulting bitmap is empty.
+func TestDeserializeDVEmpty(t *testing.T) {
+	blob := readDVTestData(t, "empty-position-index.bin")
+
+	positions, err := DeserializeDV(blob, -1)
+	require.NoError(t, err)
+
+	assert.True(t, positions.IsEmpty())
+	assert.Equal(t, int64(0), positions.Cardinality())
+}
+
+// Why: drives all three roaring container types (array, run, bitmap) through
+// the DV envelope, which is the main cross-library bug source.
+// Condition: Java-produced DV with 132,561 positions across 2 keys (0 and 1),
+// each with 3 containers.
+// Assertion: cardinality matches, and representative positions from each
+// container type are present while their unset neighbours are absent.
+func TestDeserializeDVAllContainerTypes(t *testing.T) {
+	blob := readDVTestData(t, "all-container-types-position-index.bin")
+
+	positions, err := DeserializeDV(blob, -1)
+	require.NoError(t, err)
+
+	assert.Equal(t, int64(132561), positions.Cardinality())
+
+	const key1 = uint64(1) << 32
+
+	present := []uint64{
+		// Key 0, array container: positions 5 and 7
+		5, 7,
+		// Key 0, run container: positions 65537..66535 (container 1, 999 values)
+		65537, 66535,
+		// Key 0, bitmap container: positions 131073..196606 (container 2, 65534 values)
+		131073, 196606,
+		// Key 1, array container: positions (1<<32)|10 and (1<<32)|20
+		key1 | 10, key1 | 20,
+		// Key 1, run container: starts at (1<<32)|65546
+		key1 | 65546,
+		// Key 1, bitmap container: starts at (1<<32)|131073
+		key1 | 131073,
+	}
+	for _, pos := range present {
+		assert.True(t, positions.Contains(pos))
+	}
+
+	// Unset neighbours of the key-0 containers checked above.
+	absent := []uint64{6, 65536, 66536, 131072}
+	for _, pos := range absent {
+		assert.False(t, positions.Contains(pos))
+	}
+}
+
+// Why: checks DV decoding when a single key holds values from both the small
+// and the large end of the 32-bit range.
+// Condition: Java-produced DV with positions [100, 101, 2147483747, 2147483748].
+// Assertion: cardinality is 4, every position is present, and the adjacent
+// unset positions are absent.
+func TestDeserializeDVSmallAndLargeValues(t *testing.T) {
+	blob := readDVTestData(t, "small-and-large-values-position-index.bin")
+
+	positions, err := DeserializeDV(blob, -1)
+	require.NoError(t, err)
+
+	assert.Equal(t, int64(4), positions.Cardinality())
+
+	// 2147483747 and 2147483748 are Integer.MAX_VALUE + 100 and + 101.
+	for _, pos := range []uint64{100, 101, 2147483747, 2147483748} {
+		assert.True(t, positions.Contains(pos))
+	}
+	for _, pos := range []uint64{99, 102} {
+		assert.False(t, positions.Contains(pos))
+	}
+}
+
+// Why: a truncated payload must fail cleanly before any slicing or decoding.
+// Condition: fewer than the minimum 12 bytes needed for length, magic, and CRC.
+// Assertion: the error contains "too short".
+func TestDeserializeDVTooShort(t *testing.T) {
+	truncated := []byte{0, 1, 2}
+
+	_, err := DeserializeDV(truncated, -1)
+	assert.ErrorContains(t, err, "too short")
+}
+
+// Why: the DV envelope owns the outer length field and must reject mismatches.
+// Condition: the payload has only the minimum bytes, yet the encoded length
+// claims 99.
+// Assertion: the error contains "length mismatch".
+func TestDeserializeDVBadLength(t *testing.T) {
+	blob := make([]byte, dvMinSize)
+	binary.BigEndian.PutUint32(blob[:dvLengthSize], 99)
+
+	_, err := DeserializeDV(blob, -1)
+	assert.ErrorContains(t, err, "length mismatch")
+}
+
+// Why: non-DV payloads must be rejected even when the framing size looks valid.
+// Condition: the encoded length is correct for a magic-only payload, but the
+// magic value is not DVMagicNumber.
+// Assertion: the error contains "invalid deletion vector magic".
+func TestDeserializeDVBadMagic(t *testing.T) {
+	blob := make([]byte, dvMinSize)
+	lengthField := blob[:dvLengthSize]
+	magicField := blob[dvLengthSize : dvLengthSize+dvMagicSize]
+
+	binary.BigEndian.PutUint32(lengthField, dvMagicSize)
+	binary.LittleEndian.PutUint32(magicField, 0xFFFFFFFF)
+
+	_, err := DeserializeDV(blob, -1)
+	assert.ErrorContains(t, err, "invalid deletion vector magic")
+}
+
+// Why: the CRC is the DV format's corruption check and should trip before
+// bitmap decoding.
+// Condition: take a valid golden DV blob and flip one byte of the stored CRC.
+// Assertion: the error contains "CRC mismatch".
+func TestDeserializeDVBadCRC(t *testing.T) {
+	golden := readDVTestData(t, "small-alternating-values-position-index.bin")
+
+	// Work on a private copy so the fixture bytes stay untouched.
+	corrupted := append([]byte(nil), golden...)
+	last := len(corrupted) - 1
+	corrupted[last] ^= 0xFF
+
+	_, err := DeserializeDV(corrupted, -1)
+	assert.ErrorContains(t, err, "CRC mismatch")
+}
+
+// Why: DV tests only have to show that inner roaring decode failures surface
+// with DV context attached.
+// Condition: a valid DV envelope wrapping invalid roaring bitmap bytes, with a
+// matching CRC.
+// Assertion: the error contains "deserialize deletion vector bitmap".
+func TestDeserializeDVWrapsBitmapDecodeError(t *testing.T) {
+	garbageBitmap := []byte{0x00, 0x01, 0x02}
+	blob := wrapDVPayloadForTest(garbageBitmap)
+
+	_, err := DeserializeDV(blob, -1)
+	assert.ErrorContains(t, err, "deserialize deletion vector bitmap")
+}
+
+// Why: manifest metadata and DV content should agree on how many rows are
+// deleted.
+// Condition: a valid DV blob with 5 positions, decoded with an expected
+// cardinality of 999.
+// Assertion: the error contains "cardinality mismatch".
+func TestDeserializeDVCardinalityMismatch(t *testing.T) {
+	const wrongCardinality = 999
+
+	blob := readDVTestData(t, "small-alternating-values-position-index.bin")
+
+	_, err := DeserializeDV(blob, wrongCardinality)
+	assert.ErrorContains(t, err, "cardinality mismatch")
+}
+
+// Why: this is the production path joining Puffin I/O, blob range selection,
+// and DV parsing.
+// Condition: a real Puffin file holding one valid DV blob, with offset and
+// size taken from the Puffin metadata.
+// Assertion: no error, cardinality is 5, sampled positions are present, and an
+// unset position is absent.
+func TestReadDV(t *testing.T) {
+	blob := readDVTestData(t, "small-alternating-values-position-index.bin")
+	puffinPath, blobMeta := writePuffinWithDVBlob(t, t.TempDir(), blob)
+
+	blobOffset, blobSize := blobMeta.Offset, blobMeta.Length
+	dvFile := newDVTestFile(puffinPath, 5, &blobOffset, &blobSize)
+
+	positions, err := ReadDV(iceio.LocalFS{}, dvFile)
+	require.NoError(t, err)
+
+	assert.Equal(t, int64(5), positions.Cardinality())
+	for _, deleted := range []uint64{1, 9} {
+		assert.True(t, positions.Contains(deleted))
+	}
+	assert.False(t, positions.Contains(2))
+}
+
+// Why: ReadDV should turn away the wrong file type before doing any I/O.
+// Condition: the DataFile reports Parquet format instead of Puffin.
+// Assertion: the error contains "expected PUFFIN format".
+func TestReadDVWrongFormat(t *testing.T) {
+	parquetFile := &mockDVFile{
+		path:   "s3://bucket/data/pos-del.parquet",
+		format: iceberg.ParquetFile,
+	}
+
+	_, err := ReadDV(iceio.LocalFS{}, parquetFile)
+	assert.ErrorContains(t, err, "expected PUFFIN format")
+}
+
+// Why: DV manifest entries need both content offset and content size to
+// locate the blob.
+// Condition: one subtest omits the offset and the other omits the size.
+// Assertion: each case fails with an error containing
+// "missing ContentOffset/ContentSizeInBytes".
+func TestReadDVMissingContentMetadata(t *testing.T) {
+	someOffset, someSize := int64(4), int64(50)
+
+	cases := []struct {
+		name         string
+		offset, size *int64
+	}{
+		{name: "nil offset", offset: nil, size: &someSize},
+		{name: "nil size", offset: &someOffset, size: nil},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			dvFile := newDVTestFile("s3://bucket/data/dv.puffin", 0, tc.offset, tc.size)
+
+			_, err := ReadDV(iceio.LocalFS{}, dvFile)
+			assert.ErrorContains(t, err, "missing ContentOffset/ContentSizeInBytes")
+		})
+	}
+}
+
+// Why: negative or absurdly large blob sizes should be refused before any
+// allocation happens.
+// Condition: ContentSizeInBytes is set to -1.
+// Assertion: the error contains "out of valid range".
+func TestReadDVInvalidBlobSize(t *testing.T) {
+	blobOffset, negativeSize := int64(4), int64(-1)
+	dvFile := newDVTestFile("s3://bucket/data/dv.puffin", 0, &blobOffset, &negativeSize)
+
+	_, err := ReadDV(iceio.LocalFS{}, dvFile)
+	assert.ErrorContains(t, err, "out of valid range")
+}
+
+// Why: ReadDV should wrap storage open failures with file-path context.
+// Condition: a custom IO implementation returns a fixed error from Open.
+// Assertion: the error contains both "open DV file missing.puffin" and the
+// underlying "boom" message.
+func TestReadDVOpenError(t *testing.T) {
+	blobOffset, blobSize := int64(4), int64(16)
+	brokenFS := failingOpenFS{err: errors.New("boom")}
+	dvFile := newDVTestFile("missing.puffin", 0, &blobOffset, &blobSize)
+
+	_, err := ReadDV(brokenFS, dvFile)
+	assert.ErrorContains(t, err, "open DV file missing.puffin")
+	assert.ErrorContains(t, err, "boom")
+}
+
+// Why: ReadDV should report Puffin container parse failures separately from
+// DV parse failures.
+// Condition: the file exists and opens, but its contents are not a valid
+// Puffin file.
+// Assertion: the error contains "create puffin reader".
+func TestReadDVInvalidPuffin(t *testing.T) {
+	bogusPath := filepath.Join(t.TempDir(), "invalid.puffin")
+	writeErr := os.WriteFile(bogusPath, []byte("not a puffin file"), 0o644)
+	require.NoError(t, writeErr)
+
+	blobOffset, blobSize := int64(4), int64(16)
+	dvFile := newDVTestFile(bogusPath, 0, &blobOffset, &blobSize)
+
+	_, err := ReadDV(iceio.LocalFS{}, dvFile)
+	assert.ErrorContains(t, err, "create puffin reader")
+}
+
+// Why: manifest-provided blob ranges must land inside the Puffin blob area,
+// not at arbitrary offsets.
+// Condition: a valid Puffin file, but with the content offset forced to 0,
+// which points before the blob region.
+// Assertion: the error contains "read DV blob at offset 0".
+func TestReadDVInvalidBlobRange(t *testing.T) {
+	blob := readDVTestData(t, "small-alternating-values-position-index.bin")
+	puffinPath, blobMeta := writePuffinWithDVBlob(t, t.TempDir(), blob)
+
+	badOffset, blobSize := int64(0), blobMeta.Length
+	dvFile := newDVTestFile(puffinPath, 5, &badOffset, &blobSize)
+
+	_, err := ReadDV(iceio.LocalFS{}, dvFile)
+	assert.ErrorContains(t, err, "read DV blob at offset 0")
+}
diff --git a/table/dv/roaring_bitmap.go b/table/dv/roaring_bitmap.go
new file mode 100644
index 00000000..cb25844a
--- /dev/null
+++ b/table/dv/roaring_bitmap.go
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "io"
+ "maps"
+ "slices"
+ "sort"
+
+ "github.com/RoaringBitmap/roaring/v2"
+ "github.com/apache/iceberg-go/puffin"
+)
+
+// maxBitmapCount is the maximum number of 32-bit bitmap keys allowed during
+// deserialization. This prevents CPU/memory exhaustion from absurd counts
+// in malformed input. Derived from puffin.DefaultMaxBlobSize / 8 (minimum
+// per-bitmap overhead: 4-byte key + at least 4 bytes of roaring data), so it
+// is an upper bound on how many entries a maximum-size blob could hold.
+var maxBitmapCount = uint64(puffin.DefaultMaxBlobSize / 8)
+
+// RoaringPositionBitmap supports 64-bit positions using a sparse map of
+// 32-bit Roaring bitmaps. Positions are split into a 32-bit key
+// (high bits) and 32-bit value (low bits).
+//
+// Compatible with the Java Iceberg RoaringPositionBitmap serialization format.
+type RoaringPositionBitmap struct {
+	// bitmaps maps the high 32 bits of a position to the bitmap holding the
+	// low 32 bits of every position that shares that key.
+	bitmaps map[uint32]*roaring.Bitmap
+}
+
+// NewRoaringPositionBitmap returns a bitmap with no positions set and an
+// allocated, empty key map.
+func NewRoaringPositionBitmap() *RoaringPositionBitmap {
+	b := new(RoaringPositionBitmap)
+	b.bitmaps = map[uint32]*roaring.Bitmap{}
+
+	return b
+}
+
+// Set marks a position in the bitmap.
+//
+// The high 32 bits of pos select the per-key bitmap and the low 32 bits are
+// added to it. The key map and the per-key bitmap are created on demand, so
+// Set also works on a zero-value RoaringPositionBitmap instead of panicking
+// on a nil map write.
+func (b *RoaringPositionBitmap) Set(pos uint64) {
+	key := uint32(pos >> 32)
+	low := uint32(pos)
+
+	if b.bitmaps == nil {
+		b.bitmaps = make(map[uint32]*roaring.Bitmap)
+	}
+
+	bm, ok := b.bitmaps[key]
+	if !ok {
+		bm = roaring.New()
+		b.bitmaps[key] = bm
+	}
+	bm.Add(low)
+}
+
+// Contains reports whether pos is set. The high 32 bits of pos pick the
+// per-key bitmap; a missing key means the position is not set.
+func (b *RoaringPositionBitmap) Contains(pos uint64) bool {
+	if container, found := b.bitmaps[uint32(pos>>32)]; found {
+		return container.Contains(uint32(pos))
+	}
+
+	return false
+}
+
+// IsEmpty reports whether the bitmap holds no positions, i.e. every per-key
+// bitmap has a cardinality of zero.
+func (b *RoaringPositionBitmap) IsEmpty() bool {
+	for _, bm := range b.bitmaps {
+		if bm.GetCardinality() != 0 {
+			return false
+		}
+	}
+
+	return true
+}
+
+// Cardinality returns how many positions are set, summed over every per-key
+// bitmap.
+func (b *RoaringPositionBitmap) Cardinality() int64 {
+	total := int64(0)
+	for key := range b.bitmaps {
+		total += int64(b.bitmaps[key].GetCardinality())
+	}
+
+	return total
+}
+
+// Serialize writes the bitmap to w in the Iceberg portable format
+// (little-endian):
+//   - bitmap count (8 bytes, LE): number of non-empty bitmaps
+//   - for each bitmap in ascending key order: key (4 bytes, LE) followed by
+//     the roaring portable data
+//
+// Empty bitmaps are skipped, matching Java Iceberg behavior. The first write
+// error is returned, wrapped with the step that failed.
+func (b *RoaringPositionBitmap) Serialize(w io.Writer) error {
+	nonEmpty := make([]uint32, 0, len(b.bitmaps))
+	for key := range b.bitmaps {
+		if b.bitmaps[key].GetCardinality() == 0 {
+			continue
+		}
+		nonEmpty = append(nonEmpty, key)
+	}
+	sort.Slice(nonEmpty, func(x, y int) bool { return nonEmpty[x] < nonEmpty[y] })
+
+	count := int64(len(nonEmpty))
+	if err := binary.Write(w, binary.LittleEndian, count); err != nil {
+		return fmt.Errorf("write bitmap count: %w", err)
+	}
+
+	for _, key := range nonEmpty {
+		if err := binary.Write(w, binary.LittleEndian, key); err != nil {
+			return fmt.Errorf("write key %d: %w", key, err)
+		}
+
+		_, err := b.bitmaps[key].WriteTo(w)
+		if err != nil {
+			return fmt.Errorf("write bitmap %d: %w", key, err)
+		}
+	}
+
+	return nil
+}
+
+// DeserializeRoaringPositionBitmap reads a bitmap from the Iceberg portable
+// format.
+// Format: [count] { [key][bitmap] } ... { [key_n][bitmap_n] }
+//
+// The count is an 8-byte little-endian value; each key is a 4-byte
+// little-endian value and keys must be strictly ascending. An error is
+// returned when the count exceeds maxBitmapCount, when a key is out of
+// order, or when the count, a key, or a bitmap cannot be read.
+func DeserializeRoaringPositionBitmap(data []byte) (*RoaringPositionBitmap, error) {
+	// Smallest possible entry: 4-byte key plus at least 4 bytes of roaring
+	// data (the same bound maxBitmapCount is derived from).
+	const minEntrySize = 8
+
+	r := bytes.NewReader(data)
+
+	var count uint64
+	if err := binary.Read(r, binary.LittleEndian, &count); err != nil {
+		return nil, fmt.Errorf("read bitmap count: %w", err)
+	}
+	if count > maxBitmapCount {
+		return nil, fmt.Errorf("bitmap count %d exceeds maximum allowed %d", count, maxBitmapCount)
+	}
+
+	// The count comes from untrusted input. Pre-size the map by what the
+	// remaining bytes could actually hold, so a tiny malformed payload that
+	// declares millions of entries cannot force a huge allocation before
+	// the first read fails.
+	sizeHint := min(count, uint64(r.Len())/minEntrySize)
+	b := &RoaringPositionBitmap{
+		bitmaps: make(map[uint32]*roaring.Bitmap, sizeHint),
+	}
+	var lastKey uint32
+	hasLastKey := false
+
+	for i := range count {
+		var key uint32
+		if err := binary.Read(r, binary.LittleEndian, &key); err != nil {
+			return nil, fmt.Errorf("read key %d: %w", i, err)
+		}
+		if hasLastKey && key <= lastKey {
+			return nil, fmt.Errorf("keys must be ascending: got %d after %d", key, lastKey)
+		}
+
+		bm := roaring.New()
+		if _, err := bm.ReadFrom(r); err != nil {
+			return nil, fmt.Errorf("read bitmap for key %d: %w", key, err)
+		}
+		b.bitmaps[key] = bm
+		lastKey = key
+		hasLastKey = true
+	}
+
+	return b, nil
+}
+
+// sortedKeys collects every key of the bitmap map and returns them in
+// ascending order.
+func (b *RoaringPositionBitmap) sortedKeys() []uint32 {
+	keys := slices.Collect(maps.Keys(b.bitmaps))
+	slices.Sort(keys)
+
+	return keys
+}
diff --git a/table/dv/roaring_bitmap_test.go b/table/dv/roaring_bitmap_test.go
new file mode 100644
index 00000000..f060b7d4
--- /dev/null
+++ b/table/dv/roaring_bitmap_test.go
@@ -0,0 +1,219 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+ "bytes"
+ "encoding/binary"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// Why: the decoder must stay compatible with Iceberg's Java roaring bitmap format, and the empty bitmap is its smallest case.
+// Condition: decode the Java-produced fixture 64mapempty.bin, which stores zero positions.
+// Assertion: decoding succeeds, IsEmpty reports true, and Cardinality is 0.
+func TestDeserializeRoaringBitmapJavaEmpty(t *testing.T) {
+	decoded, err := DeserializeRoaringPositionBitmap(readDVTestData(t, "64mapempty.bin"))
+	require.NoError(t, err)
+
+	assert.True(t, decoded.IsEmpty())
+	assert.Equal(t, int64(0), decoded.Cardinality())
+}
+
+// Why: 64-bit positions are split into a high 32-bit key and a low 32-bit value, so decoding must work across several keys.
+// Condition: decode the Java-produced fixture 64mapspreadvals.bin (keys 0..9, low values 0..9).
+// Assertion: decoding succeeds, cardinality is 100, sample positions from different keys are present, and a position past the stored keys is absent.
+func TestDeserializeRoaringBitmapJavaSpreadValues(t *testing.T) {
+	decoded, err := DeserializeRoaringPositionBitmap(readDVTestData(t, "64mapspreadvals.bin"))
+	require.NoError(t, err)
+
+	assert.Equal(t, int64(100), decoded.Cardinality())
+
+	stored := []uint64{
+		0,
+		(uint64(3) << 32) | 7,
+		(uint64(9) << 32) | 9,
+	}
+	for _, pos := range stored {
+		assert.True(t, decoded.Contains(pos))
+	}
+
+	assert.False(t, decoded.Contains(uint64(10)<<32))
+}
+
+// Why: checks cross-implementation compatibility for the simplest non-empty case, a bitmap whose positions all fit in 32 bits.
+// Condition: decode the Java-produced fixture 64map32bitvals.bin (positions 0..9 under a single key).
+// Assertion: decoding succeeds, cardinality is 10, positions 0..9 are present, and position 10 is absent.
+func TestDeserializeRoaringBitmapJava32BitValues(t *testing.T) {
+	decoded, err := DeserializeRoaringPositionBitmap(readDVTestData(t, "64map32bitvals.bin"))
+	require.NoError(t, err)
+
+	assert.Equal(t, int64(10), decoded.Cardinality())
+	for pos := range uint64(10) {
+		assert.True(t, decoded.Contains(pos), "expected position %d to be set", pos)
+	}
+	assert.False(t, decoded.Contains(10))
+}
+
+// Why: when not even the leading bitmap count can be read, the decoder must return an error rather than misbehave.
+// Condition: the input is a nil byte slice.
+// Assertion: returns an error containing "read bitmap count".
+func TestDeserializeRoaringBitmapTruncatedInput(t *testing.T) {
+	var noInput []byte
+
+	_, err := DeserializeRoaringPositionBitmap(noInput)
+	assert.ErrorContains(t, err, "read bitmap count")
+}
+
+// Why: a count with the high bit set (for example an int64 -1 on disk) decodes
+// as a huge uint64; the upper-bound check must reject it instead of treating it
+// as a small value or passing it to make(map, hugeHint).
+// Condition: count field is 0xFFFF_FFFF_FFFF_FFFF, the on-disk form of int64(-1).
+// Assertion: returns an error containing "exceeds maximum".
+func TestDeserializeRoaringBitmapHighBitCount(t *testing.T) {
+	// All bits set is byte-for-byte what int64(-1) encodes to.
+	allOnes := ^uint64(0)
+
+	var encoded bytes.Buffer
+	require.NoError(t, binary.Write(&encoded, binary.LittleEndian, allOnes))
+
+	_, err := DeserializeRoaringPositionBitmap(encoded.Bytes())
+	assert.ErrorContains(t, err, "exceeds maximum")
+}
+
+// Why: absurdly large counts should be rejected to prevent CPU/memory exhaustion.
+// Condition: count field set to maxBitmapCount + 1, encoded as the 8-byte little-endian value the decoder reads.
+// Assertion: returns an error containing "exceeds maximum".
+func TestDeserializeRoaringBitmapExcessiveCount(t *testing.T) {
+	// Convert explicitly: binary.Write only accepts fixed-size values and the
+	// decoder reads the count as a uint64, so the encoded width must not
+	// depend on how maxBitmapCount happens to be declared.
+	tooMany := uint64(maxBitmapCount) + 1
+
+	var buf bytes.Buffer
+	require.NoError(t, binary.Write(&buf, binary.LittleEndian, tooMany))
+
+	_, err := DeserializeRoaringPositionBitmap(buf.Bytes())
+	assert.ErrorContains(t, err, "exceeds maximum")
+}
+
+// Why: every entry begins with its key, so running out of input before the key is its own decode failure.
+// Condition: the count announces one entry, but the input ends right after the count.
+// Assertion: returns an error containing "read key 0".
+func TestDeserializeRoaringBitmapTruncatedBeforeKey(t *testing.T) {
+	const announcedEntries = int64(1)
+
+	var encoded bytes.Buffer
+	require.NoError(t, binary.Write(&encoded, binary.LittleEndian, announcedEntries))
+
+	_, err := DeserializeRoaringPositionBitmap(encoded.Bytes())
+	assert.ErrorContains(t, err, "read key 0")
+}
+
+// Why: the on-wire format requires strictly ascending keys, and the decoder enforces that ordering.
+// Condition: the count announces two entries, and they are written with key 5 before key 3.
+// Assertion: returns an error containing "keys must be ascending".
+func TestDeserializeRoaringBitmapNonAscendingKeys(t *testing.T) {
+	// Keys in the (invalid) order they will be written to the stream.
+	writeOrder := []uint32{5, 3}
+
+	src := NewRoaringPositionBitmap()
+	for _, key := range writeOrder {
+		src.Set((uint64(key) << 32) | 1)
+	}
+
+	var encoded bytes.Buffer
+	require.NoError(t, binary.Write(&encoded, binary.LittleEndian, int64(len(writeOrder))))
+	for _, key := range writeOrder {
+		require.NoError(t, binary.Write(&encoded, binary.LittleEndian, key))
+		_, err := src.bitmaps[key].WriteTo(&encoded)
+		require.NoError(t, err)
+	}
+
+	_, err := DeserializeRoaringPositionBitmap(encoded.Bytes())
+	assert.ErrorContains(t, err, "keys must be ascending")
+}
+
+// Why: a key alone is not an entry; the decoder still needs the roaring bitmap payload that belongs to it.
+// Condition: the count announces one entry and key 0 follows, but the input ends before any bitmap bytes.
+// Assertion: returns an error containing "read bitmap for key 0".
+func TestDeserializeRoaringBitmapTruncatedAfterKey(t *testing.T) {
+	var encoded bytes.Buffer
+	for _, field := range []any{int64(1), uint32(0)} { // count, then key
+		require.NoError(t, binary.Write(&encoded, binary.LittleEndian, field))
+	}
+
+	_, err := DeserializeRoaringPositionBitmap(encoded.Bytes())
+	assert.ErrorContains(t, err, "read bitmap for key 0")
+}
+
+// Why: Set, Contains, Cardinality, and behavior around missing keys are the core in-memory operations of this type.
+// Condition: set six positions under keys 0, 1, and 3, leaving key 2 unused.
+// Assertion: the bitmap is non-empty with cardinality 6; sampled set positions are present; an unset position in a used key, a position in the unused key 2, and a position in a far key are absent.
+func TestRoaringBitmapSetContainsAndCardinality(t *testing.T) {
+	// at composes a 64-bit position from its high key and low value.
+	at := func(key, low uint64) uint64 { return (key << 32) | low }
+
+	bm := NewRoaringPositionBitmap()
+	for _, pos := range []uint64{0, 42, 1000, at(1, 5), at(1, 999), at(3, 1)} {
+		bm.Set(pos)
+	}
+
+	assert.False(t, bm.IsEmpty())
+	assert.Equal(t, int64(6), bm.Cardinality())
+
+	for _, pos := range []uint64{0, at(1, 999), at(3, 1)} {
+		assert.True(t, bm.Contains(pos))
+	}
+	for _, pos := range []uint64{1, at(1, 6), at(2, 1), at(100, 0)} {
+		assert.False(t, bm.Contains(pos))
+	}
+}
+
+// Why: Serialize and DeserializeRoaringPositionBitmap together form the Go encoding contract for non-empty bitmaps.
+// Condition: round-trip a bitmap whose positions fall under keys 0, 1, and 5, so there is a gap between used keys.
+// Assertion: both directions succeed, the decoded cardinality equals the original, and every original position is still present.
+func TestRoaringBitmapSerializeRoundTrip(t *testing.T) {
+	// at composes a 64-bit position from its high key and low value.
+	at := func(key, low uint64) uint64 { return (key << 32) | low }
+
+	positions := []uint64{
+		at(0, 0),
+		at(0, 1),
+		at(0, 100),
+		at(0, 65535),
+		at(1, 42),
+		at(1, 9999),
+		at(5, 0),
+		at(5, 1),
+	}
+
+	original := NewRoaringPositionBitmap()
+	for _, pos := range positions {
+		original.Set(pos)
+	}
+
+	var encoded bytes.Buffer
+	require.NoError(t, original.Serialize(&encoded))
+
+	decoded, err := DeserializeRoaringPositionBitmap(encoded.Bytes())
+	require.NoError(t, err)
+
+	assert.Equal(t, original.Cardinality(), decoded.Cardinality())
+	for _, pos := range positions {
+		assert.True(t, decoded.Contains(pos), "round-trip lost position %d", pos)
+	}
+}
+
+// Why: the empty bitmap is a boundary case of its own: the on-wire count is zero and no key/bitmap pairs follow it.
+// Condition: round-trip a freshly created bitmap through Serialize and DeserializeRoaringPositionBitmap.
+// Assertion: both directions succeed, and the decoded bitmap is empty with cardinality 0.
+func TestRoaringBitmapEmptyRoundTrip(t *testing.T) {
+	var encoded bytes.Buffer
+	require.NoError(t, NewRoaringPositionBitmap().Serialize(&encoded))
+
+	decoded, err := DeserializeRoaringPositionBitmap(encoded.Bytes())
+	require.NoError(t, err)
+
+	assert.True(t, decoded.IsEmpty())
+	assert.Equal(t, int64(0), decoded.Cardinality())
+}
diff --git a/table/dv/testdata/deletes/64map32bitvals.bin
b/table/dv/testdata/deletes/64map32bitvals.bin
new file mode 100644
index 00000000..475b8944
Binary files /dev/null and b/table/dv/testdata/deletes/64map32bitvals.bin differ
diff --git a/table/dv/testdata/deletes/64mapempty.bin
b/table/dv/testdata/deletes/64mapempty.bin
new file mode 100644
index 00000000..1b1cb4d4
Binary files /dev/null and b/table/dv/testdata/deletes/64mapempty.bin differ
diff --git a/table/dv/testdata/deletes/64mapspreadvals.bin
b/table/dv/testdata/deletes/64mapspreadvals.bin
new file mode 100644
index 00000000..83c72f6b
Binary files /dev/null and b/table/dv/testdata/deletes/64mapspreadvals.bin
differ
diff --git a/table/dv/testdata/deletes/README.md
b/table/dv/testdata/deletes/README.md
new file mode 100644
index 00000000..03b89aed
--- /dev/null
+++ b/table/dv/testdata/deletes/README.md
@@ -0,0 +1,21 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+These test fixture files are canonical deletion vector and roaring bitmap
files from the Apache Iceberg Java implementation:
+https://github.com/apache/iceberg/tree/main/core/src/test/resources/org/apache/iceberg/deletes
diff --git a/table/dv/testdata/deletes/all-container-types-position-index.bin
b/table/dv/testdata/deletes/all-container-types-position-index.bin
new file mode 100644
index 00000000..00d47303
Binary files /dev/null and
b/table/dv/testdata/deletes/all-container-types-position-index.bin differ
diff --git a/table/dv/testdata/deletes/empty-position-index.bin
b/table/dv/testdata/deletes/empty-position-index.bin
new file mode 100644
index 00000000..8bbc1265
Binary files /dev/null and b/table/dv/testdata/deletes/empty-position-index.bin
differ
diff --git
a/table/dv/testdata/deletes/small-alternating-values-position-index.bin
b/table/dv/testdata/deletes/small-alternating-values-position-index.bin
new file mode 100644
index 00000000..80829fae
Binary files /dev/null and
b/table/dv/testdata/deletes/small-alternating-values-position-index.bin differ
diff --git
a/table/dv/testdata/deletes/small-and-large-values-position-index.bin
b/table/dv/testdata/deletes/small-and-large-values-position-index.bin
new file mode 100644
index 00000000..989dabf6
Binary files /dev/null and
b/table/dv/testdata/deletes/small-and-large-values-position-index.bin differ
diff --git a/table/dv_scan_planning_test.go b/table/dv_scan_planning_test.go
index fe6ebe68..93278369 100644
--- a/table/dv_scan_planning_test.go
+++ b/table/dv_scan_planning_test.go
@@ -24,7 +24,8 @@ import (
"github.com/stretchr/testify/assert"
)
-// dvMockDataFile extends mockDataFile with DV fields.
+// dvMockDataFile extends mockDataFile with the DV-specific fields
+// (referenced data file, content offset/size) needed by scan planning tests.
type dvMockDataFile struct {
mockDataFile
referencedDataFile *string