This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git


The following commit(s) were added to refs/heads/main by this push:
     new f8b8ee9e feat(puffin): enforce required DV blob properties on write 
(#1060)
f8b8ee9e is described below

commit f8b8ee9e98e577601f9f900f0bfc653d51be5017
Author: Andrei Tserakhau <[email protected]>
AuthorDate: Sun May 17 19:43:04 2026 +0200

    feat(puffin): enforce required DV blob properties on write (#1060)
    
    Closes #1057. Puffin spec requires `deletion-vector-v1` blobs to carry
    both `cardinality` and `referenced-data-file` properties. AddBlob
    already enforces the snapshot-id/sequence-number invariants for DV
    blobs; extend the same branch to reject blobs missing either property.
    
    Closes the gap that prompted the reader-side slog.Warn fallback in #1049
    — once writers can't omit the property, the warning path stays
    unreachable from in-tree writes and only fires on cross-impl reads of
    files written by third-party tools.
    
    Two existing test fixtures that wrote DV blobs without the cardinality
    property are updated: TestRoundTrip in puffin_test.go and
    writePuffinWithDVBlob in dv's deletion_vector_test.go.
---
 puffin/dv_header_validation_test.go |  4 +++
 puffin/puffin_test.go               | 72 ++++++++++++++++++++++++++++++++++++-
 puffin/puffin_writer.go             | 20 +++++++++++
 table/dv/deletion_vector_test.go    | 44 ++++++++---------------
 4 files changed, 110 insertions(+), 30 deletions(-)

diff --git a/puffin/dv_header_validation_test.go 
b/puffin/dv_header_validation_test.go
index 7975bfec..090cbf3a 100644
--- a/puffin/dv_header_validation_test.go
+++ b/puffin/dv_header_validation_test.go
@@ -125,6 +125,10 @@ func TestReaderRejectsDeletionVectorWithBadHeader(t 
*testing.T) {
                        SnapshotID:     -1,
                        SequenceNumber: -1,
                        Fields:         []int32{rowPositionFieldID},
+                       Properties: map[string]string{
+                               "cardinality":          "0",
+                               "referenced-data-file": "data/x.parquet",
+                       },
                }, []byte("payload"))
                require.NoError(t, err)
                require.NoError(t, w.Finish())
diff --git a/puffin/puffin_test.go b/puffin/puffin_test.go
index 2752e15f..34b8be56 100644
--- a/puffin/puffin_test.go
+++ b/puffin/puffin_test.go
@@ -92,7 +92,7 @@ func TestRoundTrip(t *testing.T) {
                SnapshotID:     -1,
                SequenceNumber: -1,
                Fields:         []int32{},
-               Properties:     map[string]string{"referenced-data-file": 
"data/file.parquet"},
+               Properties:     map[string]string{"referenced-data-file": 
"data/file.parquet", "cardinality": "0"},
        }, blob2Data)
        require.NoError(t, err)
        require.NoError(t, w.Finish())
@@ -237,6 +237,76 @@ func TestWriterValidation(t *testing.T) {
                }, []byte("x"))
                assert.ErrorContains(t, err, "sequence-number")
        })
+
+       // deletion vector missing cardinality property: spec requires both
+       // cardinality and referenced-data-file; the table/dv reader does not
+       // enforce them, so we keep Go-emitted files always conformant here.
+       t.Run("deletion vector missing cardinality property", func(t 
*testing.T) {
+               w, _ := newWriter()
+               _, err := w.AddBlob(puffin.BlobMetadataInput{
+                       Type: puffin.BlobTypeDeletionVector, SnapshotID: -1, 
SequenceNumber: -1, Fields: []int32{},
+                       Properties: map[string]string{"referenced-data-file": 
"data/x.parquet"},
+               }, []byte("x"))
+               assert.ErrorContains(t, err, "cardinality")
+       })
+
+       // deletion vector empty-string cardinality: caller explicitly set the
+       // key to ""; treated identically to "absent" since "" isn't a valid
+       // cardinality value either.
+       t.Run("deletion vector empty cardinality value", func(t *testing.T) {
+               w, _ := newWriter()
+               _, err := w.AddBlob(puffin.BlobMetadataInput{
+                       Type: puffin.BlobTypeDeletionVector, SnapshotID: -1, 
SequenceNumber: -1, Fields: []int32{},
+                       Properties: map[string]string{"cardinality": "", 
"referenced-data-file": "data/x.parquet"},
+               }, []byte("x"))
+               assert.ErrorContains(t, err, "cardinality")
+       })
+
+       // deletion vector non-numeric cardinality: caller passed gibberish.
+       // Caught at write time via ParseUint so the reader doesn't surface
+       // it later as a cryptic parse error.
+       t.Run("deletion vector non-numeric cardinality", func(t *testing.T) {
+               w, _ := newWriter()
+               _, err := w.AddBlob(puffin.BlobMetadataInput{
+                       Type: puffin.BlobTypeDeletionVector, SnapshotID: -1, 
SequenceNumber: -1, Fields: []int32{},
+                       Properties: map[string]string{"cardinality": 
"not-a-number", "referenced-data-file": "data/x.parquet"},
+               }, []byte("x"))
+               assert.ErrorContains(t, err, "not a valid non-negative integer")
+       })
+
+       // deletion vector negative cardinality: -1 aliases dv.DeserializeDV's
+       // skip-validation sentinel, so accepting it on the write side would
+       // let writers silently disable the read-side check. ParseUint catches
+       // this via the same invalid-syntax error path as non-numeric input —
+       // one validation, both cases.
+       t.Run("deletion vector negative cardinality", func(t *testing.T) {
+               w, _ := newWriter()
+               _, err := w.AddBlob(puffin.BlobMetadataInput{
+                       Type: puffin.BlobTypeDeletionVector, SnapshotID: -1, 
SequenceNumber: -1, Fields: []int32{},
+                       Properties: map[string]string{"cardinality": "-1", 
"referenced-data-file": "data/x.parquet"},
+               }, []byte("x"))
+               assert.ErrorContains(t, err, "not a valid non-negative integer")
+       })
+
+       // deletion vector missing referenced-data-file property: spec-mandated.
+       t.Run("deletion vector missing referenced-data-file property", func(t 
*testing.T) {
+               w, _ := newWriter()
+               _, err := w.AddBlob(puffin.BlobMetadataInput{
+                       Type: puffin.BlobTypeDeletionVector, SnapshotID: -1, 
SequenceNumber: -1, Fields: []int32{},
+                       Properties: map[string]string{"cardinality": "0"},
+               }, []byte("x"))
+               assert.ErrorContains(t, err, "referenced-data-file")
+       })
+
+       // deletion vector with all required properties succeeds.
+       t.Run("deletion vector with all required properties", func(t 
*testing.T) {
+               w, _ := newWriter()
+               _, err := w.AddBlob(puffin.BlobMetadataInput{
+                       Type: puffin.BlobTypeDeletionVector, SnapshotID: -1, 
SequenceNumber: -1, Fields: []int32{},
+                       Properties: map[string]string{"cardinality": "0", 
"referenced-data-file": "data/x.parquet"},
+               }, []byte("x"))
+               require.NoError(t, err)
+       })
 }
 
 // TestSetCreatedBy verifies the SetCreatedBy method.
diff --git a/puffin/puffin_writer.go b/puffin/puffin_writer.go
index 8083a8b3..14db49f2 100644
--- a/puffin/puffin_writer.go
+++ b/puffin/puffin_writer.go
@@ -23,6 +23,7 @@ import (
        "fmt"
        "io"
        "math"
+       "strconv"
 
        "github.com/apache/iceberg-go"
 )
@@ -141,6 +142,25 @@ func (w *Writer) AddBlob(input BlobMetadataInput, data 
[]byte) (BlobMetadata, er
                        return BlobMetadata{}, fmt.Errorf("puffin: %s requires 
sequence-number to be -1, got %d",
                                BlobTypeDeletionVector, input.SequenceNumber)
                }
+               // Properties cardinality and referenced-data-file are 
spec-mandated
+               // for deletion-vector-v1; the reader in table/dv does not 
enforce
+               // them today, so we hold the line here to keep Go-emitted files
+               // always conformant.
+               if input.Properties["cardinality"] == "" {
+                       return BlobMetadata{}, errors.New("puffin: 
deletion-vector-v1 requires a cardinality property")
+               }
+               // Reject non-numeric or negative values at write time too — 
otherwise
+               // a writer could emit "cardinality": "abc" or "-1" that the 
reader
+               // hard-rejects later. ParseUint covers both: "-1" fails as 
invalid
+               // syntax (the minus is rejected before any value is parsed), so
+               // non-numeric and negative collapse into one error path.
+               if _, err := strconv.ParseUint(input.Properties["cardinality"], 
10, 64); err != nil {
+                       return BlobMetadata{}, fmt.Errorf("puffin: 
deletion-vector-v1 cardinality property %q is not a valid non-negative integer: 
%w",
+                               input.Properties["cardinality"], err)
+               }
+               if input.Properties["referenced-data-file"] == "" {
+                       return BlobMetadata{}, errors.New("puffin: 
deletion-vector-v1 requires a referenced-data-file property")
+               }
        }
 
        meta := BlobMetadata{
diff --git a/table/dv/deletion_vector_test.go b/table/dv/deletion_vector_test.go
index e6e68470..34d3a12c 100644
--- a/table/dv/deletion_vector_test.go
+++ b/table/dv/deletion_vector_test.go
@@ -93,8 +93,13 @@ func wrapDVPayloadForTest(bitmapBytes []byte) []byte {
 }
 
 func writePuffinWithDVBlob(t *testing.T, dir string, dvBlobBytes []byte) 
(string, puffin.BlobMetadata) {
+       // Callers of this wrapper all use the canonical 5-position fixture
+       // (small-alternating-values-position-index.bin), so hardcoding "5" is
+       // safe. Tests that exercise other bitmap sizes call
+       // writePuffinWithDVBlobAndProps directly with their own cardinality.
        return writePuffinWithDVBlobAndProps(t, dir, dvBlobBytes, 
map[string]string{
                "referenced-data-file": "s3://bucket/data/data-001.parquet",
+               "cardinality":          "5",
        })
 }
 
@@ -437,35 +442,16 @@ func TestReadDVCardinalityValidation(t *testing.T) {
                assert.Equal(t, int64(5), bm.Cardinality())
        })
 
-       t.Run("malformed property is rejected", func(t *testing.T) {
-               // Property present but unparseable — silently falling back to 
-1
-               // would mask writer bugs, so this is a hard error.
-               dir := t.TempDir()
-               path, meta := writePuffinWithDVBlobAndProps(t, dir, 
dvBlobBytes, map[string]string{
-                       "referenced-data-file": 
"s3://bucket/data/data-001.parquet",
-                       "cardinality":          "not-a-number",
-               })
-               offset, size := meta.Offset, meta.Length
-               _, err := ReadDV(iceio.LocalFS{}, newDVTestFile(path, 5, 
&offset, &size))
-               require.Error(t, err)
-               assert.Contains(t, err.Error(), "invalid cardinality")
-       })
-
-       t.Run("negative cardinality is rejected (not silently treated as 
skip-validation sentinel)", func(t *testing.T) {
-               // -1 happens to alias DeserializeDV's internal skip-validation
-               // sentinel; accepting it from blob properties would silently
-               // disable the very check this PR adds. The pre-pass rejects all
-               // negative values.
-               dir := t.TempDir()
-               path, meta := writePuffinWithDVBlobAndProps(t, dir, 
dvBlobBytes, map[string]string{
-                       "referenced-data-file": 
"s3://bucket/data/data-001.parquet",
-                       "cardinality":          "-1",
-               })
-               offset, size := meta.Offset, meta.Length
-               _, err := ReadDV(iceio.LocalFS{}, newDVTestFile(path, 5, 
&offset, &size))
-               require.Error(t, err)
-               assert.Contains(t, err.Error(), "must be non-negative")
-       })
+       // Note: the previous "malformed property is rejected" and "negative
+       // cardinality rejected on read" subtests were removed once the puffin
+       // writer started rejecting non-numeric and negative cardinality values
+       // at AddBlob time (via ParseUint). The reader-side checks in
+       // blobCardinality stay as belt-and-suspenders for third-party-written
+       // files but are no longer reachable from any in-tree writer path used
+       // to construct test fixtures. Writer-side coverage:
+       //   TestWriterValidation/deletion_vector_non-numeric_cardinality
+       //   TestWriterValidation/deletion_vector_negative_cardinality
+       // (both in puffin_test.go).
 
        t.Run("manifest offset not found in footer is rejected with a precise 
error", func(t *testing.T) {
                // Manifest entry points at an in-bounds offset that doesn't 
match

Reply via email to