This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 774c8add feat: Deletion vector reader (#866)
774c8add is described below

commit 774c8addea96522897e161216ebd456c96165406
Author: Shreyas Mishra <[email protected]>
AuthorDate: Sat May 9 01:30:31 2026 +0530

    feat: Deletion vector reader (#866)
    
    Signed-off-by: Shreyas220 <[email protected]>
---
 go.mod                                             |   3 +
 go.sum                                             |   6 +
 manifest.go                                        |  16 +-
 table/dv/deletion_vector.go                        | 132 +++++++
 table/dv/deletion_vector_test.go                   | 382 +++++++++++++++++++++
 table/dv/roaring_bitmap.go                         | 166 +++++++++
 table/dv/roaring_bitmap_test.go                    | 219 ++++++++++++
 table/dv/testdata/deletes/64map32bitvals.bin       | Bin 0 -> 48 bytes
 table/dv/testdata/deletes/64mapempty.bin           | Bin 0 -> 8 bytes
 table/dv/testdata/deletes/64mapspreadvals.bin      | Bin 0 -> 408 bytes
 table/dv/testdata/deletes/README.md                |  21 ++
 .../deletes/all-container-types-position-index.bin | Bin 0 -> 94 bytes
 table/dv/testdata/deletes/empty-position-index.bin | Bin 0 -> 20 bytes
 .../small-alternating-values-position-index.bin    | Bin 0 -> 50 bytes
 .../small-and-large-values-position-index.bin      | Bin 0 -> 56 bytes
 table/dv_scan_planning_test.go                     |   3 +-
 16 files changed, 944 insertions(+), 4 deletions(-)

diff --git a/go.mod b/go.mod
index ccbcc823..1f94eaf8 100644
--- a/go.mod
+++ b/go.mod
@@ -23,6 +23,7 @@ require (
        cloud.google.com/go/storage v1.62.1
        github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1
        github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4
+       github.com/RoaringBitmap/roaring/v2 v2.16.0
        github.com/alexflint/go-arg v1.6.1
        github.com/apache/arrow-go/v18 v18.6.0
        github.com/aws/aws-sdk-go-v2 v1.41.6
@@ -104,6 +105,7 @@ require (
        github.com/aws/aws-sdk-go-v2/service/sts v1.42.0 // indirect
        github.com/beltran/gosasl v1.0.0 // indirect
        github.com/beltran/gssapi v0.0.0-20200324152954-d86554db4bab // indirect
+       github.com/bits-and-blooms/bitset v1.24.2 // indirect
        github.com/buger/goterm v1.0.4 // indirect
        github.com/cenkalti/backoff/v4 v4.3.0 // indirect
        github.com/cenkalti/backoff/v5 v5.0.3 // indirect
@@ -197,6 +199,7 @@ require (
        github.com/moby/sys/userns v0.1.0 // indirect
        github.com/moby/term v0.5.2 // indirect
        github.com/morikuni/aec v1.1.0 // indirect
+       github.com/mschoch/smat v0.2.0 // indirect
        github.com/ncruces/go-strftime v1.0.0 // indirect
        github.com/opencontainers/go-digest v1.0.0 // indirect
        github.com/opencontainers/image-spec v1.1.1 // indirect
diff --git a/go.sum b/go.sum
index ecc67860..7735421d 100644
--- a/go.sum
+++ b/go.sum
@@ -81,6 +81,8 @@ github.com/Microsoft/hcsshim v0.14.0-rc.1 
h1:qAPXKwGOkVn8LlqgBN8GS0bxZ83hOJpcjxz
 github.com/Microsoft/hcsshim v0.14.0-rc.1/go.mod 
h1:hTKFGbnDtQb1wHiOWv4v0eN+7boSWAHyK/tNAaYZL0c=
 github.com/ProtonMail/go-crypto v1.3.0 
h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw=
 github.com/ProtonMail/go-crypto v1.3.0/go.mod 
h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE=
+github.com/RoaringBitmap/roaring/v2 v2.16.0 
h1:Kys1UNf49d5W8Tq3bpuAhIr/Z8/yPB+59CO8A6c/BbE=
+github.com/RoaringBitmap/roaring/v2 v2.16.0/go.mod 
h1:eq4wdNXxtJIS/oikeCzdX1rBzek7ANzbth041hrU8Q4=
 github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d 
h1:licZJFw2RwpHMqeKTCYkitsPqHNxTmd4SNR5r94FGM8=
 github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d/go.mod 
h1:asat636LX7Bqt5lYEZ27JNDcqxfjdBQuJ/MM4CN/Lzo=
 github.com/alexflint/go-arg v1.6.1 
h1:uZogJ6VDBjcuosydKgvYYRhh9sRCusjOvoOLZopBlnA=
@@ -150,6 +152,8 @@ github.com/beltran/gssapi 
v0.0.0-20200324152954-d86554db4bab h1:ayfcn60tXOSYy5zU
 github.com/beltran/gssapi v0.0.0-20200324152954-d86554db4bab/go.mod 
h1:GLe4UoSyvJ3cVG+DVtKen5eAiaD8mAJFuV5PT3Eeg9Q=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod 
h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/bits-and-blooms/bitset v1.24.2 
h1:M7/NzVbsytmtfHbumG+K2bremQPMJuqv1JD3vOaFxp0=
+github.com/bits-and-blooms/bitset v1.24.2/go.mod 
h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
 github.com/blang/semver v3.5.1+incompatible 
h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ=
 github.com/blang/semver v3.5.1+incompatible/go.mod 
h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
 github.com/buger/goterm v1.0.4 h1:Z9YvGmOih81P0FbVtEYTFF6YsSgxSUKEhf/f9bTMXbY=
@@ -475,6 +479,8 @@ github.com/moby/term v0.5.2 
h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
 github.com/moby/term v0.5.2/go.mod 
h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
 github.com/morikuni/aec v1.1.0 h1:vBBl0pUnvi/Je71dsRrhMBtreIqNMYErSAbEeb8jrXQ=
 github.com/morikuni/aec v1.1.0/go.mod 
h1:xDRgiq/iw5l+zkao76YTKzKttOp2cwPEne25HDkJnBw=
+github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
+github.com/mschoch/smat v0.2.0/go.mod 
h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 
h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod 
h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/ncruces/go-strftime v1.0.0 
h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
diff --git a/manifest.go b/manifest.go
index 3de9c128..2efefe8a 100644
--- a/manifest.go
+++ b/manifest.go
@@ -1646,6 +1646,7 @@ const (
        AvroFile    FileFormat = "AVRO"
        OrcFile     FileFormat = "ORC"
        ParquetFile FileFormat = "PARQUET"
+       PuffinFile  FileFormat = "PUFFIN"
 )
 
 // FileFormatFromString parses a file format string (case-insensitive).
@@ -1657,6 +1658,8 @@ func FileFormatFromString(s string) (FileFormat, error) {
                return OrcFile, nil
        case string(AvroFile):
                return AvroFile, nil
+       case string(PuffinFile):
+               return PuffinFile, nil
        default:
                return "", fmt.Errorf("unknown file format: %s", s)
        }
@@ -2152,10 +2155,17 @@ func NewDataFileBuilder(
                return nil, fmt.Errorf("%w: path cannot be empty", 
ErrInvalidArgument)
        }
 
-       if format != AvroFile && format != OrcFile && format != ParquetFile {
+       if format != AvroFile && format != OrcFile && format != ParquetFile && 
format != PuffinFile {
                return nil, fmt.Errorf(
-                       "%w: format must be one of %s, %s, or %s",
-                       ErrInvalidArgument, AvroFile, OrcFile, ParquetFile,
+                       "%w: format must be one of %s, %s, %s, or %s",
+                       ErrInvalidArgument, AvroFile, OrcFile, ParquetFile, 
PuffinFile,
+               )
+       }
+
+       if format == PuffinFile && content != EntryContentPosDeletes {
+               return nil, fmt.Errorf(
+                       "%w: %s format is only valid for %s content",
+                       ErrInvalidArgument, PuffinFile, EntryContentPosDeletes,
                )
        }
 
diff --git a/table/dv/deletion_vector.go b/table/dv/deletion_vector.go
new file mode 100644
index 00000000..2b618c89
--- /dev/null
+++ b/table/dv/deletion_vector.go
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+       "encoding/binary"
+       "fmt"
+       "hash/crc32"
+
+       "github.com/apache/iceberg-go"
+       iceio "github.com/apache/iceberg-go/io"
+       "github.com/apache/iceberg-go/puffin"
+)
+
+const (
+       // DVMagicNumber is the magic number for deletion vectors.
+       // Spec bytes in file order: D1 D3 39 64 = little-endian uint32 0x6439D3D1
+       DVMagicNumber uint32 = 0x6439D3D1
+
+       dvLengthSize = 4 // length field
+       dvMagicSize  = 4 // magic field
+       dvCRCSize    = 4 // CRC-32 checksum
+       dvMinSize    = dvLengthSize + dvMagicSize + dvCRCSize
+)
+
+// DeserializeDV parses a deletion vector blob and returns a bitmap of deleted 
positions.
+//
+// The DV binary format is:
+//   - Length (4 bytes, big-endian): size of magic + bitmap data, excluding 
CRC-32
+//   - Magic  (4 bytes, little-endian): must be 0x6439D3D1
+//   - Bitmap (variable): roaring bitmap in Iceberg portable format
+//   - CRC-32 (4 bytes, big-endian): checksum over magic + bitmap
+//
+// If expectedCardinality >= 0, the bitmap's cardinality is validated against 
it.
+func DeserializeDV(data []byte, expectedCardinality int64) 
(*RoaringPositionBitmap, error) {
+       if len(data) < dvMinSize {
+               return nil, fmt.Errorf("deletion vector payload too short: %d 
bytes (minimum %d)", len(data), dvMinSize)
+       }
+
+       // 1. Read and validate length
+       length := binary.BigEndian.Uint32(data[0:dvLengthSize])
+       expectedLength := uint32(len(data) - dvLengthSize - dvCRCSize)
+       if length != expectedLength {
+               return nil, fmt.Errorf("deletion vector length mismatch: got 
%d, expected %d", length, expectedLength)
+       }
+
+       // 2. Read and validate magic
+       magic := binary.LittleEndian.Uint32(data[dvLengthSize : 
dvLengthSize+dvMagicSize])
+       if magic != DVMagicNumber {
+               return nil, fmt.Errorf("invalid deletion vector magic: 0x%08x, 
expected 0x%08x", magic, DVMagicNumber)
+       }
+
+       // 3. Verify CRC-32 over magic + bitmap (bytes 4 to len-4)
+       bitmapDataStart := dvLengthSize
+       bitmapDataEnd := len(data) - dvCRCSize
+       computedCRC := crc32.ChecksumIEEE(data[bitmapDataStart:bitmapDataEnd])
+       expectedCRC := binary.BigEndian.Uint32(data[bitmapDataEnd:])
+       if computedCRC != expectedCRC {
+               return nil, fmt.Errorf("deletion vector CRC mismatch: computed 
0x%08x, expected 0x%08x", computedCRC, expectedCRC)
+       }
+
+       // 4. Deserialize roaring bitmap from the inner bytes (after length + 
magic, before CRC)
+       roaringStart := dvLengthSize + dvMagicSize
+       bitmap, err := 
DeserializeRoaringPositionBitmap(data[roaringStart:bitmapDataEnd])
+       if err != nil {
+               return nil, fmt.Errorf("deserialize deletion vector bitmap: 
%w", err)
+       }
+
+       // 5. Validate cardinality if requested
+       if expectedCardinality >= 0 {
+               actual := bitmap.Cardinality()
+               if actual != expectedCardinality {
+                       return nil, fmt.Errorf("deletion vector cardinality 
mismatch: got %d, expected %d", actual, expectedCardinality)
+               }
+       }
+
+       return bitmap, nil
+}
+
+// ReadDV reads a deletion vector from a puffin file using the manifest entry 
metadata.
+// ContentOffset and ContentSizeInBytes must be set on the DataFile (required 
by v3 spec).
+func ReadDV(fs iceio.IO, dvFile iceberg.DataFile) (*RoaringPositionBitmap, 
error) {
+       if dvFile.FileFormat() != iceberg.PuffinFile {
+               return nil, fmt.Errorf("expected PUFFIN format for deletion 
vector, got %s", dvFile.FileFormat())
+       }
+
+       if dvFile.ContentOffset() == nil || dvFile.ContentSizeInBytes() == nil {
+               return nil, fmt.Errorf("DV file %s missing 
ContentOffset/ContentSizeInBytes", dvFile.FilePath())
+       }
+
+       size := *dvFile.ContentSizeInBytes()
+       if size < 0 || size > int64(puffin.DefaultMaxBlobSize) {
+               return nil, fmt.Errorf("DV blob size %d out of valid range [0, 
%d]", size, puffin.DefaultMaxBlobSize)
+       }
+
+       f, err := fs.Open(dvFile.FilePath())
+       if err != nil {
+               return nil, fmt.Errorf("open DV file %s: %w", 
dvFile.FilePath(), err)
+       }
+       defer f.Close()
+
+       reader, err := puffin.NewReader(f)
+       if err != nil {
+               return nil, fmt.Errorf("create puffin reader for %s: %w", 
dvFile.FilePath(), err)
+       }
+
+       offset := *dvFile.ContentOffset()
+       blobData := make([]byte, size)
+       if _, err := reader.ReadAt(blobData, offset); err != nil {
+               return nil, fmt.Errorf("read DV blob at offset %d: %w", offset, 
err)
+       }
+
+       // Pass -1 to skip cardinality validation during deserialization.
+       // dvFile.Count() defaults to 0 when unset, which would incorrectly
+       // reject valid DVs. Callers can validate cardinality separately.
+       return DeserializeDV(blobData, -1)
+}
diff --git a/table/dv/deletion_vector_test.go b/table/dv/deletion_vector_test.go
new file mode 100644
index 00000000..1ea87142
--- /dev/null
+++ b/table/dv/deletion_vector_test.go
@@ -0,0 +1,382 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+       "encoding/binary"
+       "errors"
+       "hash/crc32"
+       "os"
+       "path/filepath"
+       "testing"
+
+       "github.com/apache/iceberg-go"
+       iceio "github.com/apache/iceberg-go/io"
+       "github.com/apache/iceberg-go/puffin"
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+// mockDVFile is a minimal iceberg.DataFile implementation with only the
+// fields exercised by the DV reader: path, format, count, and the DV-specific
+// offset/size/referenced-data-file pointers.
+type mockDVFile struct {
+       path               string
+       format             iceberg.FileFormat
+       count              int64
+       referencedDataFile *string
+       contentOffset      *int64
+       contentSizeInBytes *int64
+}
+
+func (m *mockDVFile) FilePath() string                        { return m.path }
+func (m *mockDVFile) FileFormat() iceberg.FileFormat          { return 
m.format }
+func (m *mockDVFile) Count() int64                            { return m.count 
}
+func (m *mockDVFile) ReferencedDataFile() *string             { return 
m.referencedDataFile }
+func (m *mockDVFile) ContentOffset() *int64                   { return 
m.contentOffset }
+func (m *mockDVFile) ContentSizeInBytes() *int64              { return 
m.contentSizeInBytes }
+func (*mockDVFile) ContentType() iceberg.ManifestEntryContent { return 
iceberg.EntryContentPosDeletes }
+func (*mockDVFile) Partition() map[int]any                    { return nil }
+func (*mockDVFile) FileSizeBytes() int64                      { return 0 }
+func (*mockDVFile) ColumnSizes() map[int]int64                { return nil }
+func (*mockDVFile) ValueCounts() map[int]int64                { return nil }
+func (*mockDVFile) NullValueCounts() map[int]int64            { return nil }
+func (*mockDVFile) NaNValueCounts() map[int]int64             { return nil }
+func (*mockDVFile) DistinctValueCounts() map[int]int64        { return nil }
+func (*mockDVFile) LowerBoundValues() map[int][]byte          { return nil }
+func (*mockDVFile) UpperBoundValues() map[int][]byte          { return nil }
+func (*mockDVFile) KeyMetadata() []byte                       { return nil }
+func (*mockDVFile) SplitOffsets() []int64                     { return nil }
+func (*mockDVFile) EqualityFieldIDs() []int                   { return nil }
+func (*mockDVFile) SortOrderID() *int                         { return nil }
+func (*mockDVFile) SpecID() int32                             { return 0 }
+func (*mockDVFile) FirstRowID() *int64                        { return nil }
+
+func strPtr(s string) *string { return &s }
+
+func readDVTestData(t *testing.T, name string) []byte {
+       t.Helper()
+       data, err := os.ReadFile(filepath.Join("testdata", "deletes", name))
+       require.NoError(t, err)
+
+       return data
+}
+
+func wrapDVPayloadForTest(bitmapBytes []byte) []byte {
+       bitmapDataLength := dvMagicSize + len(bitmapBytes)
+       totalSize := dvLengthSize + bitmapDataLength + dvCRCSize
+       out := make([]byte, totalSize)
+
+       binary.BigEndian.PutUint32(out[0:dvLengthSize], 
uint32(bitmapDataLength))
+       
binary.LittleEndian.PutUint32(out[dvLengthSize:dvLengthSize+dvMagicSize], 
DVMagicNumber)
+       copy(out[dvLengthSize+dvMagicSize:], bitmapBytes)
+
+       crc := crc32.ChecksumIEEE(out[dvLengthSize : totalSize-dvCRCSize])
+       binary.BigEndian.PutUint32(out[totalSize-dvCRCSize:], crc)
+
+       return out
+}
+
+func writePuffinWithDVBlob(t *testing.T, dir string, dvBlobBytes []byte) 
(string, puffin.BlobMetadata) {
+       t.Helper()
+
+       path := filepath.Join(dir, "test-dv.puffin")
+       f, err := os.Create(path)
+       require.NoError(t, err)
+       defer f.Close()
+
+       w, err := puffin.NewWriter(f)
+       require.NoError(t, err)
+
+       meta, err := w.AddBlob(puffin.BlobMetadataInput{
+               Type:           puffin.BlobTypeDeletionVector,
+               SnapshotID:     -1,
+               SequenceNumber: -1,
+               Fields:         []int32{2147483546},
+               Properties: map[string]string{
+                       "referenced-data-file": 
"s3://bucket/data/data-001.parquet",
+               },
+       }, dvBlobBytes)
+       require.NoError(t, err)
+       require.NoError(t, w.Finish())
+
+       return path, meta
+}
+
+func newDVTestFile(path string, count int64, offset, size *int64) *mockDVFile {
+       return &mockDVFile{
+               path:               path,
+               format:             iceberg.PuffinFile,
+               count:              count,
+               referencedDataFile: strPtr("s3://bucket/data/data-001.parquet"),
+               contentOffset:      offset,
+               contentSizeInBytes: size,
+       }
+}
+
+type failingOpenFS struct {
+       err error
+}
+
+func (f failingOpenFS) Open(string) (iceio.File, error) { return nil, f.err }
+func (f failingOpenFS) Remove(string) error             { return nil }
+
+// Why: proves the DV envelope can successfully decode a representative valid 
blob.
+// Condition: Java-produced DV with deleted positions [1, 3, 5, 7, 9] and 
cardinality validation disabled.
+// Assertion: no error, cardinality is 5, expected positions are present, and 
an adjacent unset position is absent.
+func TestDeserializeDV(t *testing.T) {
+       data := readDVTestData(t, "small-alternating-values-position-index.bin")
+
+       bm, err := DeserializeDV(data, -1)
+       require.NoError(t, err)
+
+       assert.Equal(t, int64(5), bm.Cardinality())
+       assert.True(t, bm.Contains(1))
+       assert.True(t, bm.Contains(9))
+       assert.False(t, bm.Contains(2))
+}
+
+// Why: validates the DV envelope for the empty case — zero deleted positions.
+// Condition: Java-produced DV with no positions.
+// Assertion: no error, bitmap is empty.
+func TestDeserializeDVEmpty(t *testing.T) {
+       data := readDVTestData(t, "empty-position-index.bin")
+
+       bm, err := DeserializeDV(data, -1)
+       require.NoError(t, err)
+
+       assert.True(t, bm.IsEmpty())
+       assert.Equal(t, int64(0), bm.Cardinality())
+}
+
+// Why: exercises all three roaring container types (array, run, bitmap) 
through the DV envelope,
+// which is the main cross-library bug source.
+// Condition: Java-produced DV with 132,561 positions across 2 keys (0 and 1), 
each with 3 containers.
+// Assertion: cardinality matches, representative positions from each 
container type are present.
+func TestDeserializeDVAllContainerTypes(t *testing.T) {
+       data := readDVTestData(t, "all-container-types-position-index.bin")
+
+       bm, err := DeserializeDV(data, -1)
+       require.NoError(t, err)
+
+       assert.Equal(t, int64(132561), bm.Cardinality())
+
+       // Key 0, array container: positions 5 and 7
+       assert.True(t, bm.Contains(5))
+       assert.True(t, bm.Contains(7))
+       assert.False(t, bm.Contains(6))
+
+       // Key 0, run container: positions 65537..66535 (container 1, 999 
values)
+       assert.True(t, bm.Contains(65537))
+       assert.True(t, bm.Contains(66535))
+       assert.False(t, bm.Contains(65536))
+       assert.False(t, bm.Contains(66536))
+
+       // Key 0, bitmap container: positions 131073..196606 (container 2, 
65534 values)
+       assert.True(t, bm.Contains(131073))
+       assert.True(t, bm.Contains(196606))
+       assert.False(t, bm.Contains(131072))
+
+       // Key 1, array container: positions (1<<32)|10 and (1<<32)|20
+       assert.True(t, bm.Contains((uint64(1)<<32)|10))
+       assert.True(t, bm.Contains((uint64(1)<<32)|20))
+
+       // Key 1, run container: starts at (1<<32)|65546
+       assert.True(t, bm.Contains((uint64(1)<<32)|65546))
+
+       // Key 1, bitmap container: starts at (1<<32)|131073
+       assert.True(t, bm.Contains((uint64(1)<<32)|131073))
+}
+
+// Why: validates DV decoding with values that span both small and large 
32-bit ranges in a single key.
+// Condition: Java-produced DV with positions [100, 101, 2147483747, 
2147483748].
+// Assertion: cardinality is 4, all positions present, adjacent unset 
positions absent.
+func TestDeserializeDVSmallAndLargeValues(t *testing.T) {
+       data := readDVTestData(t, "small-and-large-values-position-index.bin")
+
+       bm, err := DeserializeDV(data, -1)
+       require.NoError(t, err)
+
+       assert.Equal(t, int64(4), bm.Cardinality())
+       assert.True(t, bm.Contains(100))
+       assert.True(t, bm.Contains(101))
+       assert.True(t, bm.Contains(2147483747)) // Integer.MAX_VALUE + 100
+       assert.True(t, bm.Contains(2147483748)) // Integer.MAX_VALUE + 101
+       assert.False(t, bm.Contains(99))
+       assert.False(t, bm.Contains(102))
+}
+
+// Why: truncated payloads should fail cleanly before any slicing or decoding 
happens.
+// Condition: fewer than the minimum 12 bytes required for length, magic, and 
CRC.
+// Assertion: returns an error containing "too short".
+func TestDeserializeDVTooShort(t *testing.T) {
+       _, err := DeserializeDV([]byte{0, 1, 2}, -1)
+       assert.ErrorContains(t, err, "too short")
+}
+
+// Why: the DV envelope owns the outer length field and must reject mismatches.
+// Condition: payload has only the minimum bytes, but the encoded length says 
99.
+// Assertion: returns an error containing "length mismatch".
+func TestDeserializeDVBadLength(t *testing.T) {
+       data := make([]byte, dvMinSize)
+       binary.BigEndian.PutUint32(data[0:dvLengthSize], 99)
+
+       _, err := DeserializeDV(data, -1)
+       assert.ErrorContains(t, err, "length mismatch")
+}
+
+// Why: the reader must reject non-DV payloads even if the framing size looks 
valid.
+// Condition: encoded length is correct for a magic-only payload, but the 
magic value is not DVMagicNumber.
+// Assertion: returns an error containing "invalid deletion vector magic".
+func TestDeserializeDVBadMagic(t *testing.T) {
+       data := make([]byte, dvMinSize)
+       binary.BigEndian.PutUint32(data[0:dvLengthSize], dvMagicSize)
+       
binary.LittleEndian.PutUint32(data[dvLengthSize:dvLengthSize+dvMagicSize], 
0xFFFFFFFF)
+
+       _, err := DeserializeDV(data, -1)
+       assert.ErrorContains(t, err, "invalid deletion vector magic")
+}
+
+// Why: CRC is the DV format's corruption check and should fail before bitmap 
decoding.
+// Condition: start from a valid golden DV blob, then flip one byte in the 
stored CRC.
+// Assertion: returns an error containing "CRC mismatch".
+func TestDeserializeDVBadCRC(t *testing.T) {
+       data := append([]byte(nil), readDVTestData(t, 
"small-alternating-values-position-index.bin")...)
+       data[len(data)-1] ^= 0xFF
+
+       _, err := DeserializeDV(data, -1)
+       assert.ErrorContains(t, err, "CRC mismatch")
+}
+
+// Why: DV tests only need to prove that inner roaring decode failures are 
surfaced with DV context.
+// Condition: valid DV envelope containing invalid roaring bitmap bytes and a 
matching CRC.
+// Assertion: returns an error containing "deserialize deletion vector bitmap".
+func TestDeserializeDVWrapsBitmapDecodeError(t *testing.T) {
+       data := wrapDVPayloadForTest([]byte{0x00, 0x01, 0x02})
+
+       _, err := DeserializeDV(data, -1)
+       assert.ErrorContains(t, err, "deserialize deletion vector bitmap")
+}
+
+// Why: manifest metadata and DV content should agree on the number of deleted 
rows.
+// Condition: valid DV blob with 5 positions, but expected cardinality is set 
to 999.
+// Assertion: returns an error containing "cardinality mismatch".
+func TestDeserializeDVCardinalityMismatch(t *testing.T) {
+       data := readDVTestData(t, "small-alternating-values-position-index.bin")
+
+       _, err := DeserializeDV(data, 999)
+       assert.ErrorContains(t, err, "cardinality mismatch")
+}
+
+// Why: this is the production path that ties together Puffin I/O, blob range 
selection, and DV parsing.
+// Condition: real Puffin file containing one valid DV blob, with offset and 
size taken from Puffin metadata.
+// Assertion: no error, cardinality is 5, expected positions are present, and 
an unset position is absent.
+func TestReadDV(t *testing.T) {
+       dvBlobBytes := readDVTestData(t, 
"small-alternating-values-position-index.bin")
+
+       dir := t.TempDir()
+       path, meta := writePuffinWithDVBlob(t, dir, dvBlobBytes)
+
+       offset, size := meta.Offset, meta.Length
+       bm, err := ReadDV(iceio.LocalFS{}, newDVTestFile(path, 5, &offset, 
&size))
+       require.NoError(t, err)
+
+       assert.Equal(t, int64(5), bm.Cardinality())
+       assert.True(t, bm.Contains(1))
+       assert.True(t, bm.Contains(9))
+       assert.False(t, bm.Contains(2))
+}
+
+// Why: ReadDV should reject callers that pass the wrong file type before 
doing any I/O.
+// Condition: DataFile reports Parquet format instead of Puffin.
+// Assertion: returns an error containing "expected PUFFIN format".
+func TestReadDVWrongFormat(t *testing.T) {
+       _, err := ReadDV(iceio.LocalFS{}, &mockDVFile{
+               path:   "s3://bucket/data/pos-del.parquet",
+               format: iceberg.ParquetFile,
+       })
+       assert.ErrorContains(t, err, "expected PUFFIN format")
+}
+
+// Why: DV manifest entries require both content offset and content size to 
find the blob.
+// Condition: one subtest omits offset and another omits size.
+// Assertion: each case returns an error containing "missing 
ContentOffset/ContentSizeInBytes".
+func TestReadDVMissingContentMetadata(t *testing.T) {
+       t.Run("nil offset", func(t *testing.T) {
+               size := int64(50)
+
+               _, err := ReadDV(iceio.LocalFS{}, 
newDVTestFile("s3://bucket/data/dv.puffin", 0, nil, &size))
+               assert.ErrorContains(t, err, "missing 
ContentOffset/ContentSizeInBytes")
+       })
+
+       t.Run("nil size", func(t *testing.T) {
+               offset := int64(4)
+
+               _, err := ReadDV(iceio.LocalFS{}, 
newDVTestFile("s3://bucket/data/dv.puffin", 0, &offset, nil))
+               assert.ErrorContains(t, err, "missing 
ContentOffset/ContentSizeInBytes")
+       })
+}
+
+// Why: negative or absurdly large blob sizes should be rejected before 
allocation.
+// Condition: ContentSizeInBytes set to -1.
+// Assertion: returns an error containing "out of valid range".
+func TestReadDVInvalidBlobSize(t *testing.T) {
+       offset := int64(4)
+       negSize := int64(-1)
+
+       _, err := ReadDV(iceio.LocalFS{}, 
newDVTestFile("s3://bucket/data/dv.puffin", 0, &offset, &negSize))
+       assert.ErrorContains(t, err, "out of valid range")
+}
+
+// Why: storage open failures should be wrapped with file-path context by 
ReadDV.
+// Condition: custom IO implementation returns a fixed error from Open.
+// Assertion: error contains both "open DV file missing.puffin" and the 
underlying "boom" message.
+func TestReadDVOpenError(t *testing.T) {
+       offset, size := int64(4), int64(16)
+
+       _, err := ReadDV(failingOpenFS{err: errors.New("boom")}, 
newDVTestFile("missing.puffin", 0, &offset, &size))
+       assert.ErrorContains(t, err, "open DV file missing.puffin")
+       assert.ErrorContains(t, err, "boom")
+}
+
+// Why: ReadDV should surface Puffin container parse failures distinctly from 
DV parse failures.
+// Condition: file exists and can be opened, but its contents are not a valid 
Puffin file.
+// Assertion: returns an error containing "create puffin reader".
+func TestReadDVInvalidPuffin(t *testing.T) {
+       dir := t.TempDir()
+       path := filepath.Join(dir, "invalid.puffin")
+       require.NoError(t, os.WriteFile(path, []byte("not a puffin file"), 
0o644))
+
+       offset, size := int64(4), int64(16)
+       _, err := ReadDV(iceio.LocalFS{}, newDVTestFile(path, 0, &offset, 
&size))
+       assert.ErrorContains(t, err, "create puffin reader")
+}
+
+// Why: manifest-provided blob ranges must point into the Puffin blob area, 
not arbitrary offsets.
+// Condition: valid Puffin file, but content offset is forced to 0, which 
points before the blob region.
+// Assertion: returns an error containing "read DV blob at offset 0".
+func TestReadDVInvalidBlobRange(t *testing.T) {
+       dvBlobBytes := readDVTestData(t, 
"small-alternating-values-position-index.bin")
+
+       dir := t.TempDir()
+       path, meta := writePuffinWithDVBlob(t, dir, dvBlobBytes)
+
+       offset, size := int64(0), meta.Length
+       _, err := ReadDV(iceio.LocalFS{}, newDVTestFile(path, 5, &offset, 
&size))
+       assert.ErrorContains(t, err, "read DV blob at offset 0")
+}
diff --git a/table/dv/roaring_bitmap.go b/table/dv/roaring_bitmap.go
new file mode 100644
index 00000000..cb25844a
--- /dev/null
+++ b/table/dv/roaring_bitmap.go
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+       "bytes"
+       "encoding/binary"
+       "fmt"
+       "io"
+       "maps"
+       "slices"
+       "sort"
+
+       "github.com/RoaringBitmap/roaring/v2"
+       "github.com/apache/iceberg-go/puffin"
+)
+
+// maxBitmapCount caps the number of 32-bit bitmap keys accepted during
+// deserialization, so a malformed count cannot drive excessive CPU or memory
+// use. It is puffin.DefaultMaxBlobSize / 8, where 8 is taken as the minimum
+// bytes per entry (4-byte key + at least 4 bytes of roaring data).
+var maxBitmapCount = uint64(puffin.DefaultMaxBlobSize / 8)
+
+// RoaringPositionBitmap is a set of 64-bit positions stored as a sparse map
+// of 32-bit Roaring bitmaps: the high 32 bits of a position select the map
+// key and the low 32 bits are the value stored in that key's bitmap.
+//
+// Compatible with the Java Iceberg RoaringPositionBitmap serialization format.
+type RoaringPositionBitmap struct {
+       bitmaps map[uint32]*roaring.Bitmap
+}
+
+// NewRoaringPositionBitmap returns a bitmap with no positions set and an
+// already-allocated key map.
+func NewRoaringPositionBitmap() *RoaringPositionBitmap {
+       byKey := make(map[uint32]*roaring.Bitmap)
+
+       return &RoaringPositionBitmap{bitmaps: byKey}
+}
+
+// Set marks pos as present. The high 32 bits of pos select the per-key
+// bitmap (created on first use) and the low 32 bits are added to it.
+//
+// The key map is allocated lazily, so Set also works on a zero-value
+// RoaringPositionBitmap instead of panicking on a nil map write.
+func (b *RoaringPositionBitmap) Set(pos uint64) {
+       if b.bitmaps == nil {
+               b.bitmaps = make(map[uint32]*roaring.Bitmap)
+       }
+
+       key := uint32(pos >> 32)
+       low := uint32(pos)
+       bm, ok := b.bitmaps[key]
+       if !ok {
+               bm = roaring.New()
+               b.bitmaps[key] = bm
+       }
+       bm.Add(low)
+}
+
+// Contains reports whether pos has been set. A position whose high 32 bits
+// have no bitmap is reported as absent.
+func (b *RoaringPositionBitmap) Contains(pos uint64) bool {
+       if perKey, found := b.bitmaps[uint32(pos>>32)]; found {
+               return perKey.Contains(uint32(pos))
+       }
+
+       return false
+}
+
+// IsEmpty reports whether no positions are set. It stops at the first
+// non-empty per-key bitmap rather than summing every bitmap's cardinality.
+func (b *RoaringPositionBitmap) IsEmpty() bool {
+       for _, bm := range b.bitmaps {
+               if !bm.IsEmpty() {
+                       return false
+               }
+       }
+
+       return true
+}
+
+// Cardinality returns the total number of set positions.
+func (b *RoaringPositionBitmap) Cardinality() int64 {
+       var c int64
+       for _, bm := range b.bitmaps {
+               c += int64(bm.GetCardinality())
+       }
+
+       return c
+}
+
+// Serialize encodes the bitmap to w in the Iceberg portable layout, using
+// little-endian byte order throughout:
+//   - an 8-byte count of the non-empty bitmaps
+//   - then, per bitmap in ascending key order, the 4-byte key followed by the
+//     bitmap's roaring portable data
+//
+// Bitmaps with no positions are skipped, matching Java Iceberg behavior.
+func (b *RoaringPositionBitmap) Serialize(w io.Writer) error {
+       nonEmpty := make([]uint32, 0, len(b.bitmaps))
+       for key := range b.bitmaps {
+               if b.bitmaps[key].GetCardinality() == 0 {
+                       continue
+               }
+               nonEmpty = append(nonEmpty, key)
+       }
+       sort.Slice(nonEmpty, func(x, y int) bool { return nonEmpty[x] < nonEmpty[y] })
+
+       err := binary.Write(w, binary.LittleEndian, int64(len(nonEmpty)))
+       if err != nil {
+               return fmt.Errorf("write bitmap count: %w", err)
+       }
+
+       for _, key := range nonEmpty {
+               if err = binary.Write(w, binary.LittleEndian, key); err != nil {
+                       return fmt.Errorf("write key %d: %w", key, err)
+               }
+               if _, err = b.bitmaps[key].WriteTo(w); err != nil {
+                       return fmt.Errorf("write bitmap %d: %w", key, err)
+               }
+       }
+
+       return nil
+}
+
+// DeserializeRoaringPositionBitmap decodes a bitmap from the Iceberg portable
+// format (little-endian):
+//
+//     [count: 8 bytes] { [key: 4 bytes][roaring portable bitmap] } x count
+//
+// Keys must be strictly ascending. An error is returned when the count cannot
+// be read or exceeds maxBitmapCount, when a key or bitmap payload cannot be
+// read, or when keys are not in ascending order.
+func DeserializeRoaringPositionBitmap(data []byte) (*RoaringPositionBitmap, error) {
+       r := bytes.NewReader(data)
+
+       var count uint64
+       if err := binary.Read(r, binary.LittleEndian, &count); err != nil {
+               return nil, fmt.Errorf("read bitmap count: %w", err)
+       }
+       if count > maxBitmapCount {
+               return nil, fmt.Errorf("bitmap count %d exceeds maximum allowed %d", count, maxBitmapCount)
+       }
+
+       // count comes from untrusted input. Use it only as an upper bound for
+       // the map size hint and cap it by what the remaining bytes could hold
+       // (at least 8 bytes per entry, the same minimum maxBitmapCount assumes),
+       // so a tiny payload cannot force a large up-front allocation. Truncated
+       // input is still reported by the per-entry reads below.
+       sizeHint := min(count, uint64(r.Len())/8)
+
+       b := &RoaringPositionBitmap{
+               bitmaps: make(map[uint32]*roaring.Bitmap, sizeHint),
+       }
+       var lastKey uint32
+       hasLastKey := false
+
+       for i := range count {
+               var key uint32
+               if err := binary.Read(r, binary.LittleEndian, &key); err != nil {
+                       return nil, fmt.Errorf("read key %d: %w", i, err)
+               }
+               // Strictly ascending: a repeated key is rejected as well.
+               if hasLastKey && key <= lastKey {
+                       return nil, fmt.Errorf("keys must be ascending: got %d after %d", key, lastKey)
+               }
+
+               bm := roaring.New()
+               if _, err := bm.ReadFrom(r); err != nil {
+                       return nil, fmt.Errorf("read bitmap for key %d: %w", key, err)
+               }
+               b.bitmaps[key] = bm
+               lastKey = key
+               hasLastKey = true
+       }
+
+       return b, nil
+}
+
+// sortedKeys collects every bitmap key and returns them in ascending order.
+func (b *RoaringPositionBitmap) sortedKeys() []uint32 {
+       keys := slices.Collect(maps.Keys(b.bitmaps))
+       slices.Sort(keys)
+
+       return keys
+}
diff --git a/table/dv/roaring_bitmap_test.go b/table/dv/roaring_bitmap_test.go
new file mode 100644
index 00000000..f060b7d4
--- /dev/null
+++ b/table/dv/roaring_bitmap_test.go
@@ -0,0 +1,219 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+       "bytes"
+       "encoding/binary"
+       "testing"
+
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+// Why: this type owns compatibility with Iceberg's Java roaring bitmap format, including the empty case.
+// Condition: deserialize a Java-produced bitmap with zero stored positions.
+// Assertion: no error, bitmap is empty, and cardinality is 0.
+func TestDeserializeRoaringBitmapJavaEmpty(t *testing.T) {
+       data := readDVTestData(t, "64mapempty.bin")
+
+       bm, err := DeserializeRoaringPositionBitmap(data)
+       require.NoError(t, err)
+
+       assert.True(t, bm.IsEmpty())
+       assert.Equal(t, int64(0), bm.Cardinality())
+}
+
+// Why: the bitmap layer owns 64-bit position mapping across multiple high-bit keys.
+// Condition: deserialize a Java-produced bitmap with keys 0..9 and low values 0..9.
+// Assertion: no error, cardinality is 100, representative positions across keys exist, and a position beyond the stored range is absent.
+func TestDeserializeRoaringBitmapJavaSpreadValues(t *testing.T) {
+       data := readDVTestData(t, "64mapspreadvals.bin")
+
+       bm, err := DeserializeRoaringPositionBitmap(data)
+       require.NoError(t, err)
+
+       assert.Equal(t, int64(100), bm.Cardinality())
+       assert.True(t, bm.Contains(0))
+       assert.True(t, bm.Contains((uint64(3)<<32)|7))
+       assert.True(t, bm.Contains((uint64(9)<<32)|9))
+       assert.False(t, bm.Contains(uint64(10)<<32))
+}
+
+// Why: validates cross-impl compatibility for a simple 32-bit-only bitmap.
+// Condition: deserialize a Java-produced bitmap with positions [0..9] in a single key.
+// Assertion: no error, cardinality is 10, all expected positions present.
+func TestDeserializeRoaringBitmapJava32BitValues(t *testing.T) {
+       data := readDVTestData(t, "64map32bitvals.bin")
+
+       bm, err := DeserializeRoaringPositionBitmap(data)
+       require.NoError(t, err)
+
+       assert.Equal(t, int64(10), bm.Cardinality())
+       for i := uint64(0); i < 10; i++ {
+               assert.True(t, bm.Contains(i), "expected position %d to be 
set", i)
+       }
+       assert.False(t, bm.Contains(10))
+}
+
+// Why: the deserializer must fail cleanly when the outer bitmap count cannot be read.
+// Condition: empty input stream.
+// Assertion: returns an error containing "read bitmap count".
+func TestDeserializeRoaringBitmapTruncatedInput(t *testing.T) {
+       _, err := DeserializeRoaringPositionBitmap(nil)
+       assert.ErrorContains(t, err, "read bitmap count")
+}
+
+// Why: counts with the high bit set (e.g. an int64 -1 written to disk) decode
+// as a huge uint64 and must be rejected by the upper-bound check, not silently
+// accepted as a small value or panicked on by make(map, hugeHint).
+// Condition: count field encoded as int64(-1) (= 0xFFFF_FFFF_FFFF_FFFF on disk).
+// Assertion: returns an error containing "exceeds maximum".
+func TestDeserializeRoaringBitmapHighBitCount(t *testing.T) {
+       var buf bytes.Buffer
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, int64(-1)))
+
+       _, err := DeserializeRoaringPositionBitmap(buf.Bytes())
+       assert.ErrorContains(t, err, "exceeds maximum")
+}
+
+// Why: absurdly large counts should be rejected to prevent CPU/memory exhaustion.
+// Condition: count field set to maxBitmapCount + 1.
+// Assertion: returns an error containing "exceeds maximum".
+func TestDeserializeRoaringBitmapExcessiveCount(t *testing.T) {
+       var buf bytes.Buffer
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, 
maxBitmapCount+1))
+
+       _, err := DeserializeRoaringPositionBitmap(buf.Bytes())
+       assert.ErrorContains(t, err, "exceeds maximum")
+}
+
+// Why: each bitmap entry must start with a key; premature EOF before that key is a distinct decode failure.
+// Condition: count says 1, but no key bytes follow.
+// Assertion: returns an error containing "read key 0".
+func TestDeserializeRoaringBitmapTruncatedBeforeKey(t *testing.T) {
+       var buf bytes.Buffer
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, int64(1)))
+
+       _, err := DeserializeRoaringPositionBitmap(buf.Bytes())
+       assert.ErrorContains(t, err, "read key 0")
+}
+
+// Why: the on-wire format requires keys to be strictly ascending so the decoder can rebuild the sparse key space correctly.
+// Condition: encoded bitmap count is 2, but entries are written with key 5 before key 3.
+// Assertion: returns an error containing "keys must be ascending".
+func TestDeserializeRoaringBitmapNonAscendingKeys(t *testing.T) {
+       bm := NewRoaringPositionBitmap()
+       bm.Set((uint64(5) << 32) | 1)
+       bm.Set((uint64(3) << 32) | 1)
+
+       var buf bytes.Buffer
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, int64(2)))
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, uint32(5)))
+       _, err := bm.bitmaps[5].WriteTo(&buf)
+       require.NoError(t, err)
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, uint32(3)))
+       _, err = bm.bitmaps[3].WriteTo(&buf)
+       require.NoError(t, err)
+
+       _, err = DeserializeRoaringPositionBitmap(buf.Bytes())
+       assert.ErrorContains(t, err, "keys must be ascending")
+}
+
+// Why: after a valid key is read, the decoder still needs a full roaring bitmap payload for that key.
+// Condition: count says 1 and key 0 is present, but no roaring bitmap bytes follow.
+// Assertion: returns an error containing "read bitmap for key 0".
+func TestDeserializeRoaringBitmapTruncatedAfterKey(t *testing.T) {
+       var buf bytes.Buffer
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, int64(1)))
+       require.NoError(t, binary.Write(&buf, binary.LittleEndian, uint32(0)))
+
+       _, err := DeserializeRoaringPositionBitmap(buf.Bytes())
+       assert.ErrorContains(t, err, "read bitmap for key 0")
+}
+
+// Why: Set, Contains, Cardinality, and gap handling are the core in-memory behaviors of this type.
+// Condition: set positions across keys 0, 1, and 3, leaving key 2 absent.
+// Assertion: cardinality counts all set positions, expected positions are present, and unset positions in the same key, a gap key, and a far key are absent.
+func TestRoaringBitmapSetContainsAndCardinality(t *testing.T) {
+       bm := NewRoaringPositionBitmap()
+
+       bm.Set(0)
+       bm.Set(42)
+       bm.Set(1000)
+       bm.Set((uint64(1) << 32) | 5)
+       bm.Set((uint64(1) << 32) | 999)
+       bm.Set((uint64(3) << 32) | 1)
+
+       assert.False(t, bm.IsEmpty())
+       assert.Equal(t, int64(6), bm.Cardinality())
+       assert.True(t, bm.Contains(0))
+       assert.True(t, bm.Contains((uint64(1)<<32)|999))
+       assert.True(t, bm.Contains((uint64(3)<<32)|1))
+       assert.False(t, bm.Contains(1))
+       assert.False(t, bm.Contains((uint64(1)<<32)|6))
+       assert.False(t, bm.Contains((uint64(2)<<32)|1))
+       assert.False(t, bm.Contains(uint64(100)<<32))
+}
+
+// Why: Serialize and DeserializeRoaringPositionBitmap together define the Go encoding contract for non-empty bitmaps.
+// Condition: round-trip a bitmap with positions spread across multiple keys and an internal key gap.
+// Assertion: serialization succeeds, deserialization succeeds, cardinality is preserved, and all original positions remain present.
+func TestRoaringBitmapSerializeRoundTrip(t *testing.T) {
+       bm := NewRoaringPositionBitmap()
+       positions := []uint64{
+               0,
+               1,
+               100,
+               65535,
+               (uint64(1) << 32) | 42,
+               (uint64(1) << 32) | 9999,
+               uint64(5) << 32,
+               (uint64(5) << 32) | 1,
+       }
+       for _, pos := range positions {
+               bm.Set(pos)
+       }
+
+       var buf bytes.Buffer
+       require.NoError(t, bm.Serialize(&buf))
+
+       got, err := DeserializeRoaringPositionBitmap(buf.Bytes())
+       require.NoError(t, err)
+
+       assert.Equal(t, bm.Cardinality(), got.Cardinality())
+       for _, pos := range positions {
+               assert.True(t, got.Contains(pos), "round-trip lost position 
%d", pos)
+       }
+}
+
+// Why: empty serialization is a separate boundary case because the on-wire count is zero and no key/bitmap pairs follow.
+// Condition: round-trip an empty bitmap through Serialize and DeserializeRoaringPositionBitmap.
+// Assertion: serialization succeeds, deserialization succeeds, bitmap is empty, and cardinality is 0.
+func TestRoaringBitmapEmptyRoundTrip(t *testing.T) {
+       bm := NewRoaringPositionBitmap()
+
+       var buf bytes.Buffer
+       require.NoError(t, bm.Serialize(&buf))
+
+       got, err := DeserializeRoaringPositionBitmap(buf.Bytes())
+       require.NoError(t, err)
+
+       assert.True(t, got.IsEmpty())
+       assert.Equal(t, int64(0), got.Cardinality())
+}
diff --git a/table/dv/testdata/deletes/64map32bitvals.bin 
b/table/dv/testdata/deletes/64map32bitvals.bin
new file mode 100644
index 00000000..475b8944
Binary files /dev/null and b/table/dv/testdata/deletes/64map32bitvals.bin differ
diff --git a/table/dv/testdata/deletes/64mapempty.bin 
b/table/dv/testdata/deletes/64mapempty.bin
new file mode 100644
index 00000000..1b1cb4d4
Binary files /dev/null and b/table/dv/testdata/deletes/64mapempty.bin differ
diff --git a/table/dv/testdata/deletes/64mapspreadvals.bin 
b/table/dv/testdata/deletes/64mapspreadvals.bin
new file mode 100644
index 00000000..83c72f6b
Binary files /dev/null and b/table/dv/testdata/deletes/64mapspreadvals.bin 
differ
diff --git a/table/dv/testdata/deletes/README.md 
b/table/dv/testdata/deletes/README.md
new file mode 100644
index 00000000..03b89aed
--- /dev/null
+++ b/table/dv/testdata/deletes/README.md
@@ -0,0 +1,21 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+These test fixture files are canonical deletion vector and roaring bitmap files from the Apache Iceberg Java implementation:
+https://github.com/apache/iceberg/tree/main/core/src/test/resources/org/apache/iceberg/deletes
diff --git a/table/dv/testdata/deletes/all-container-types-position-index.bin 
b/table/dv/testdata/deletes/all-container-types-position-index.bin
new file mode 100644
index 00000000..00d47303
Binary files /dev/null and 
b/table/dv/testdata/deletes/all-container-types-position-index.bin differ
diff --git a/table/dv/testdata/deletes/empty-position-index.bin 
b/table/dv/testdata/deletes/empty-position-index.bin
new file mode 100644
index 00000000..8bbc1265
Binary files /dev/null and b/table/dv/testdata/deletes/empty-position-index.bin 
differ
diff --git 
a/table/dv/testdata/deletes/small-alternating-values-position-index.bin 
b/table/dv/testdata/deletes/small-alternating-values-position-index.bin
new file mode 100644
index 00000000..80829fae
Binary files /dev/null and 
b/table/dv/testdata/deletes/small-alternating-values-position-index.bin differ
diff --git 
a/table/dv/testdata/deletes/small-and-large-values-position-index.bin 
b/table/dv/testdata/deletes/small-and-large-values-position-index.bin
new file mode 100644
index 00000000..989dabf6
Binary files /dev/null and 
b/table/dv/testdata/deletes/small-and-large-values-position-index.bin differ
diff --git a/table/dv_scan_planning_test.go b/table/dv_scan_planning_test.go
index fe6ebe68..93278369 100644
--- a/table/dv_scan_planning_test.go
+++ b/table/dv_scan_planning_test.go
@@ -24,7 +24,8 @@ import (
        "github.com/stretchr/testify/assert"
 )
 
-// dvMockDataFile extends mockDataFile with DV fields.
+// dvMockDataFile extends mockDataFile with the DV-specific fields
+// (referenced data file, content offset/size) needed by scan planning tests.
 type dvMockDataFile struct {
        mockDataFile
        referencedDataFile *string


Reply via email to