laskoviymishka commented on code in PR #1131:
URL: https://github.com/apache/iceberg-go/pull/1131#discussion_r3306911669


##########
table/dv/dv_cross_client_test.go:
##########
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+       "context"
+       "encoding/binary"
+       "hash/crc32"
+       "testing"
+
+       "github.com/apache/iceberg-go"
+       iceio "github.com/apache/iceberg-go/io"
+       "github.com/apache/iceberg-go/puffin"
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+// TestCrossClientDVWriterProducesSpecCompliantBlob validates that DVWriter
+// output conforms to the DV spec envelope format and can be parsed by
+// any spec-compliant reader (Java, pyiceberg, iceberg-rust).
+//
+// Format verified:
+//   - Puffin file structure (magic + blobs + footer)
+//   - Blob type is "deletion-vector-v1"
+//   - snapshot-id = -1, sequence-number = -1
+//   - Blob properties include "cardinality" and "referenced-data-file"
+//   - Inner DV envelope: [length][magic 0xD1D33964][bitmap][CRC32]
+func TestCrossClientDVWriterProducesSpecCompliantBlob(t *testing.T) {

Review Comment:
   This is Go-writes-Go-reads — `NewDVWriter` produces the file, 
`puffin.NewReader` + `DeserializeDV` reads it back, and the cardinality is 
checked against `df.Count()` which is itself Go-derived. There's no 
Java/PyIceberg/Rust artifact in the loop, so a Puffin-layer regression in any 
of those clients wouldn't be caught. `TestDVWriterSingleDataFile` + 
`verifyDVReadBack` in `dv_writer_test.go` already covers the roundtrip.
   
   What this test could be doing — and what would actually earn the 
"cross-client" name — is reading a Java-produced single-blob Puffin file from 
`testdata/` via `ReadDV`, then asserting on cardinality, referenced-data-file, 
and bitmap contents. The testdata directory has Java-produced DV envelopes 
(`.bin`) but no Java-produced Puffin file; checking one in here is the unlock.



##########
table/dv/dv_cross_client_test.go:
##########
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+       "context"
+       "encoding/binary"
+       "hash/crc32"
+       "testing"
+
+       "github.com/apache/iceberg-go"
+       iceio "github.com/apache/iceberg-go/io"
+       "github.com/apache/iceberg-go/puffin"
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+// TestCrossClientDVWriterProducesSpecCompliantBlob validates that DVWriter
+// output conforms to the DV spec envelope format and can be parsed by
+// any spec-compliant reader (Java, pyiceberg, iceberg-rust).
+//
+// Format verified:
+//   - Puffin file structure (magic + blobs + footer)
+//   - Blob type is "deletion-vector-v1"
+//   - snapshot-id = -1, sequence-number = -1
+//   - Blob properties include "cardinality" and "referenced-data-file"
+//   - Inner DV envelope: [length][magic 0xD1D33964][bitmap][CRC32]
+func TestCrossClientDVWriterProducesSpecCompliantBlob(t *testing.T) {
+       fs := iceio.NewMemFS()
+       w := NewDVWriter(fs)
+
+       dataPath := "s3://warehouse/db/table/data/00000-0-abc.parquet"
+       positions := []int64{1, 3, 5, 7, 9}
+       w.Add(dataPath, positions)
+
+       location := "mem://xc/dv-test.puffin"
+       dataFiles, err := w.Flush(context.Background(), location)
+       require.NoError(t, err)
+       require.Len(t, dataFiles, 1)
+
+       df := dataFiles[0]
+       assert.Equal(t, iceberg.PuffinFile, df.FileFormat())
+       assert.Equal(t, iceberg.EntryContentPosDeletes, df.ContentType())
+       assert.Equal(t, int64(5), df.Count())
+       assert.Equal(t, dataPath, *df.ReferencedDataFile())
+
+       f, err := fs.Open(location)
+       require.NoError(t, err)
+       defer f.Close()
+
+       reader, err := puffin.NewReader(f)
+       require.NoError(t, err)
+
+       blobs := reader.Blobs()
+       require.Len(t, blobs, 1)
+
+       blob := blobs[0]
+       assert.Equal(t, puffin.BlobTypeDeletionVector, blob.Type)
+       assert.Equal(t, int64(-1), blob.SnapshotID)
+       assert.Equal(t, int64(-1), blob.SequenceNumber)
+       assert.Equal(t, "5", blob.Properties["cardinality"])
+       assert.Equal(t, dataPath, blob.Properties["referenced-data-file"])
+
+       blobData, err := reader.ReadBlob(0)
+       require.NoError(t, err)
+
+       verifyDVEnvelopeFormat(t, blobData.Data)
+
+       bm, err := DeserializeDV(blobData.Data, 5)
+       require.NoError(t, err)
+       assert.Equal(t, int64(5), bm.Cardinality())
+       for _, pos := range positions {
+               assert.True(t, bm.Contains(uint64(pos)), "position %d should be 
set", pos)
+       }
+}
+
+// TestCrossClientGoSerializeMatchesJavaFormat verifies that Go's SerializeDV
+// produces byte-identical output to a Java-produced DV fixture for the same
+// input positions. Byte equivalence is a stronger cross-client guarantee than
+// roundtrip-readability: any spec-compliant reader will accept Go's output,
+// and Go's output is indistinguishable from Java's at the wire level.
+func TestCrossClientGoSerializeMatchesJavaFormat(t *testing.T) {
+       javaBytes := readDVTestData(t, 
"small-alternating-values-position-index.bin")
+
+       bm := NewRoaringPositionBitmap()
+       bm.Set(1)
+       bm.Set(3)
+       bm.Set(5)
+       bm.Set(7)
+       bm.Set(9)
+
+       goBytes, err := SerializeDV(bm)
+       require.NoError(t, err)
+
+       assert.Equal(t, javaBytes, goBytes,
+               "Go-serialized DV must be byte-identical to Java fixture for 
the same positions")
+}
+
+// TestCrossClientMultiFileDVPuffin validates that a multi-blob Puffin file
+// (one blob per data file) has correct offsets and each blob is independently
+// readable — matching how Java writes multi-DV Puffin files.
+func TestCrossClientMultiFileDVPuffin(t *testing.T) {

Review Comment:
   Same shape as the comment on 
`TestCrossClientDVWriterProducesSpecCompliantBlob` — Go writes the multi-blob 
Puffin, Go reads it back, asserts cardinality against `df.Count()`. 
`TestDVWriterMultipleDataFiles` already covers this. To actually be a 
cross-client test, this should consume a Java-produced multi-blob Puffin from 
`testdata/` and check that `ReadDV` recovers the right bitmap for each 
`DataFile`. The Puffin footer layout for multi-blob files is exactly the place 
where small encoding choices (offset encoding, blob-meta ordering) cause silent 
corruption across clients.



##########
table/dv/dv_cross_client_test.go:
##########
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package dv
+
+import (
+       "context"
+       "encoding/binary"
+       "hash/crc32"
+       "testing"
+
+       "github.com/apache/iceberg-go"
+       iceio "github.com/apache/iceberg-go/io"
+       "github.com/apache/iceberg-go/puffin"
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+// TestCrossClientDVWriterProducesSpecCompliantBlob validates that DVWriter
+// output conforms to the DV spec envelope format and can be parsed by
+// any spec-compliant reader (Java, pyiceberg, iceberg-rust).
+//
+// Format verified:
+//   - Puffin file structure (magic + blobs + footer)
+//   - Blob type is "deletion-vector-v1"
+//   - snapshot-id = -1, sequence-number = -1
+//   - Blob properties include "cardinality" and "referenced-data-file"
+//   - Inner DV envelope: [length][magic 0xD1D33964][bitmap][CRC32]
+func TestCrossClientDVWriterProducesSpecCompliantBlob(t *testing.T) {
+       fs := iceio.NewMemFS()
+       w := NewDVWriter(fs)
+
+       dataPath := "s3://warehouse/db/table/data/00000-0-abc.parquet"
+       positions := []int64{1, 3, 5, 7, 9}
+       w.Add(dataPath, positions)
+
+       location := "mem://xc/dv-test.puffin"
+       dataFiles, err := w.Flush(context.Background(), location)
+       require.NoError(t, err)
+       require.Len(t, dataFiles, 1)
+
+       df := dataFiles[0]
+       assert.Equal(t, iceberg.PuffinFile, df.FileFormat())
+       assert.Equal(t, iceberg.EntryContentPosDeletes, df.ContentType())
+       assert.Equal(t, int64(5), df.Count())
+       assert.Equal(t, dataPath, *df.ReferencedDataFile())
+
+       f, err := fs.Open(location)
+       require.NoError(t, err)
+       defer f.Close()
+
+       reader, err := puffin.NewReader(f)
+       require.NoError(t, err)
+
+       blobs := reader.Blobs()
+       require.Len(t, blobs, 1)
+
+       blob := blobs[0]
+       assert.Equal(t, puffin.BlobTypeDeletionVector, blob.Type)
+       assert.Equal(t, int64(-1), blob.SnapshotID)
+       assert.Equal(t, int64(-1), blob.SequenceNumber)
+       assert.Equal(t, "5", blob.Properties["cardinality"])
+       assert.Equal(t, dataPath, blob.Properties["referenced-data-file"])
+
+       blobData, err := reader.ReadBlob(0)
+       require.NoError(t, err)
+
+       verifyDVEnvelopeFormat(t, blobData.Data)
+
+       bm, err := DeserializeDV(blobData.Data, 5)
+       require.NoError(t, err)
+       assert.Equal(t, int64(5), bm.Cardinality())
+       for _, pos := range positions {
+               assert.True(t, bm.Contains(uint64(pos)), "position %d should be 
set", pos)
+       }
+}
+
+// TestCrossClientGoSerializeMatchesJavaFormat verifies that Go's SerializeDV
+// produces byte-identical output to a Java-produced DV fixture for the same
+// input positions. Byte equivalence is a stronger cross-client guarantee than
+// roundtrip-readability: any spec-compliant reader will accept Go's output,
+// and Go's output is indistinguishable from Java's at the wire level.
+func TestCrossClientGoSerializeMatchesJavaFormat(t *testing.T) {

Review Comment:
   This is the only test that compares against an external artifact, so it's 
pulling a lot of weight — but the artifact is the inner DV envelope, not a 
Puffin file, and the fixture `[1,3,5,7,9]` is exactly the case that's easiest 
to byte-match: scattered low-cardinality positions, single roaring array 
container, no run encoding triggered on either side.
   
   That makes the comment overstate the guarantee in two ways. First, it's 
silent on the Puffin layer (where most cross-client bugs live). Second, 
byte-equality doesn't generalize — Java's 
`BitmapPositionDeleteIndex.serialize()` calls `runLengthEncode()` before 
writing and Go's `RoaringPositionBitmap.Serialize()` doesn't, so for 
range-style deletes the two outputs diverge bytewise while both remaining 
spec-compliant.
   
   I'd scope this test to what it actually proves (single-container DV-envelope 
parity for this fixture) and add coverage at two new axes:
   
   - A Java-produced **Puffin** fixture (not just an inner envelope) read 
end-to-end through `ReadDV`.
   - At least one DV-envelope fixture that crosses the 4096-entry array→bitmap 
threshold, and ideally one >2^32 to force a second roaring key — the 
container/key boundaries are where encoding differences actually surface.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to