This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new f8b8ee9e feat(puffin): enforce required DV blob properties on write
(#1060)
f8b8ee9e is described below
commit f8b8ee9e98e577601f9f900f0bfc653d51be5017
Author: Andrei Tserakhau <[email protected]>
AuthorDate: Sun May 17 19:43:04 2026 +0200
feat(puffin): enforce required DV blob properties on write (#1060)
Closes #1057. Puffin spec requires `deletion-vector-v1` blobs to carry
both `cardinality` and `referenced-data-file` properties. AddBlob
already enforces the snapshot-id/sequence-number invariants for DV
blobs; extend the same branch to reject blobs missing either property.
Closes the gap that prompted the reader-side slog.Warn fallback in #1049
— once writers can't omit the property, the warning path stays
unreachable from in-tree writes and only fires on cross-impl reads of
files written by third-party tools.
Two existing test fixtures that wrote DV blobs without the cardinality
property are updated: TestRoundTrip in puffin_test.go and
writePuffinWithDVBlob in dv's deletion_vector_test.go.
---
puffin/dv_header_validation_test.go | 4 +++
puffin/puffin_test.go | 72 ++++++++++++++++++++++++++++++++++++-
puffin/puffin_writer.go | 20 +++++++++++
table/dv/deletion_vector_test.go | 44 ++++++++---------------
4 files changed, 110 insertions(+), 30 deletions(-)
diff --git a/puffin/dv_header_validation_test.go
b/puffin/dv_header_validation_test.go
index 7975bfec..090cbf3a 100644
--- a/puffin/dv_header_validation_test.go
+++ b/puffin/dv_header_validation_test.go
@@ -125,6 +125,10 @@ func TestReaderRejectsDeletionVectorWithBadHeader(t
*testing.T) {
SnapshotID: -1,
SequenceNumber: -1,
Fields: []int32{rowPositionFieldID},
+ Properties: map[string]string{
+ "cardinality": "0",
+ "referenced-data-file": "data/x.parquet",
+ },
}, []byte("payload"))
require.NoError(t, err)
require.NoError(t, w.Finish())
diff --git a/puffin/puffin_test.go b/puffin/puffin_test.go
index 2752e15f..34b8be56 100644
--- a/puffin/puffin_test.go
+++ b/puffin/puffin_test.go
@@ -92,7 +92,7 @@ func TestRoundTrip(t *testing.T) {
SnapshotID: -1,
SequenceNumber: -1,
Fields: []int32{},
- Properties: map[string]string{"referenced-data-file":
"data/file.parquet"},
+ Properties: map[string]string{"referenced-data-file":
"data/file.parquet", "cardinality": "0"},
}, blob2Data)
require.NoError(t, err)
require.NoError(t, w.Finish())
@@ -237,6 +237,76 @@ func TestWriterValidation(t *testing.T) {
}, []byte("x"))
assert.ErrorContains(t, err, "sequence-number")
})
+
+ // deletion vector missing cardinality property: spec requires both
+ // cardinality and referenced-data-file; the table/dv reader does not
+ // enforce them, so we keep Go-emitted files always conformant here.
+ t.Run("deletion vector missing cardinality property", func(t
*testing.T) {
+ w, _ := newWriter()
+ _, err := w.AddBlob(puffin.BlobMetadataInput{
+ Type: puffin.BlobTypeDeletionVector, SnapshotID: -1,
SequenceNumber: -1, Fields: []int32{},
+ Properties: map[string]string{"referenced-data-file":
"data/x.parquet"},
+ }, []byte("x"))
+ assert.ErrorContains(t, err, "cardinality")
+ })
+
+ // deletion vector empty-string cardinality: caller explicitly set the
+ // key to ""; treated identically to "absent" since "" isn't a valid
+ // cardinality value either.
+ t.Run("deletion vector empty cardinality value", func(t *testing.T) {
+ w, _ := newWriter()
+ _, err := w.AddBlob(puffin.BlobMetadataInput{
+ Type: puffin.BlobTypeDeletionVector, SnapshotID: -1,
SequenceNumber: -1, Fields: []int32{},
+ Properties: map[string]string{"cardinality": "",
"referenced-data-file": "data/x.parquet"},
+ }, []byte("x"))
+ assert.ErrorContains(t, err, "cardinality")
+ })
+
+ // deletion vector non-numeric cardinality: caller passed gibberish.
+ // Caught at write time via ParseUint so the reader doesn't surface
+ // it later as a cryptic parse error.
+ t.Run("deletion vector non-numeric cardinality", func(t *testing.T) {
+ w, _ := newWriter()
+ _, err := w.AddBlob(puffin.BlobMetadataInput{
+ Type: puffin.BlobTypeDeletionVector, SnapshotID: -1,
SequenceNumber: -1, Fields: []int32{},
+ Properties: map[string]string{"cardinality":
"not-a-number", "referenced-data-file": "data/x.parquet"},
+ }, []byte("x"))
+ assert.ErrorContains(t, err, "not a valid non-negative integer")
+ })
+
+ // deletion vector negative cardinality: -1 aliases dv.DeserializeDV's
+ // skip-validation sentinel, so accepting it on the write side would
+ // let writers silently disable the read-side check. ParseUint catches
+ // this via the same invalid-syntax error path as non-numeric input —
+ // one validation, both cases.
+ t.Run("deletion vector negative cardinality", func(t *testing.T) {
+ w, _ := newWriter()
+ _, err := w.AddBlob(puffin.BlobMetadataInput{
+ Type: puffin.BlobTypeDeletionVector, SnapshotID: -1,
SequenceNumber: -1, Fields: []int32{},
+ Properties: map[string]string{"cardinality": "-1",
"referenced-data-file": "data/x.parquet"},
+ }, []byte("x"))
+ assert.ErrorContains(t, err, "not a valid non-negative integer")
+ })
+
+ // deletion vector missing referenced-data-file property: spec-mandated.
+ t.Run("deletion vector missing referenced-data-file property", func(t
*testing.T) {
+ w, _ := newWriter()
+ _, err := w.AddBlob(puffin.BlobMetadataInput{
+ Type: puffin.BlobTypeDeletionVector, SnapshotID: -1,
SequenceNumber: -1, Fields: []int32{},
+ Properties: map[string]string{"cardinality": "0"},
+ }, []byte("x"))
+ assert.ErrorContains(t, err, "referenced-data-file")
+ })
+
+ // deletion vector with all required properties succeeds.
+ t.Run("deletion vector with all required properties", func(t
*testing.T) {
+ w, _ := newWriter()
+ _, err := w.AddBlob(puffin.BlobMetadataInput{
+ Type: puffin.BlobTypeDeletionVector, SnapshotID: -1,
SequenceNumber: -1, Fields: []int32{},
+ Properties: map[string]string{"cardinality": "0",
"referenced-data-file": "data/x.parquet"},
+ }, []byte("x"))
+ require.NoError(t, err)
+ })
}
// TestSetCreatedBy verifies the SetCreatedBy method.
diff --git a/puffin/puffin_writer.go b/puffin/puffin_writer.go
index 8083a8b3..14db49f2 100644
--- a/puffin/puffin_writer.go
+++ b/puffin/puffin_writer.go
@@ -23,6 +23,7 @@ import (
"fmt"
"io"
"math"
+ "strconv"
"github.com/apache/iceberg-go"
)
@@ -141,6 +142,25 @@ func (w *Writer) AddBlob(input BlobMetadataInput, data
[]byte) (BlobMetadata, er
return BlobMetadata{}, fmt.Errorf("puffin: %s requires
sequence-number to be -1, got %d",
BlobTypeDeletionVector, input.SequenceNumber)
}
+ // Properties cardinality and referenced-data-file are
spec-mandated
+ // for deletion-vector-v1; the reader in table/dv does not
enforce
+ // them today, so we hold the line here to keep Go-emitted files
+ // always conformant.
+ if input.Properties["cardinality"] == "" {
+ return BlobMetadata{}, errors.New("puffin:
deletion-vector-v1 requires a cardinality property")
+ }
+ // Reject non-numeric or negative values at write time too —
otherwise
+ // a writer could emit "cardinality": "abc" or "-1" that the
reader
+ // hard-rejects later. ParseUint covers both: "-1" fails as
invalid
+ // syntax (the minus is rejected before any value is parsed), so
+ // non-numeric and negative collapse into one error path.
+ if _, err := strconv.ParseUint(input.Properties["cardinality"],
10, 64); err != nil {
+ return BlobMetadata{}, fmt.Errorf("puffin:
deletion-vector-v1 cardinality property %q is not a valid non-negative integer:
%w",
+ input.Properties["cardinality"], err)
+ }
+ if input.Properties["referenced-data-file"] == "" {
+ return BlobMetadata{}, errors.New("puffin:
deletion-vector-v1 requires a referenced-data-file property")
+ }
}
meta := BlobMetadata{
diff --git a/table/dv/deletion_vector_test.go b/table/dv/deletion_vector_test.go
index e6e68470..34d3a12c 100644
--- a/table/dv/deletion_vector_test.go
+++ b/table/dv/deletion_vector_test.go
@@ -93,8 +93,13 @@ func wrapDVPayloadForTest(bitmapBytes []byte) []byte {
}
func writePuffinWithDVBlob(t *testing.T, dir string, dvBlobBytes []byte)
(string, puffin.BlobMetadata) {
+ // Callers of this wrapper all use the canonical 5-position fixture
+ // (small-alternating-values-position-index.bin), so hardcoding "5" is
+ // safe. Tests that exercise other bitmap sizes call
+ // writePuffinWithDVBlobAndProps directly with their own cardinality.
return writePuffinWithDVBlobAndProps(t, dir, dvBlobBytes,
map[string]string{
"referenced-data-file": "s3://bucket/data/data-001.parquet",
+ "cardinality": "5",
})
}
@@ -437,35 +442,16 @@ func TestReadDVCardinalityValidation(t *testing.T) {
assert.Equal(t, int64(5), bm.Cardinality())
})
- t.Run("malformed property is rejected", func(t *testing.T) {
- // Property present but unparseable — silently falling back to
-1
- // would mask writer bugs, so this is a hard error.
- dir := t.TempDir()
- path, meta := writePuffinWithDVBlobAndProps(t, dir,
dvBlobBytes, map[string]string{
- "referenced-data-file":
"s3://bucket/data/data-001.parquet",
- "cardinality": "not-a-number",
- })
- offset, size := meta.Offset, meta.Length
- _, err := ReadDV(iceio.LocalFS{}, newDVTestFile(path, 5,
&offset, &size))
- require.Error(t, err)
- assert.Contains(t, err.Error(), "invalid cardinality")
- })
-
- t.Run("negative cardinality is rejected (not silently treated as
skip-validation sentinel)", func(t *testing.T) {
- // -1 happens to alias DeserializeDV's internal skip-validation
- // sentinel; accepting it from blob properties would silently
- // disable the very check this PR adds. The pre-pass rejects all
- // negative values.
- dir := t.TempDir()
- path, meta := writePuffinWithDVBlobAndProps(t, dir,
dvBlobBytes, map[string]string{
- "referenced-data-file":
"s3://bucket/data/data-001.parquet",
- "cardinality": "-1",
- })
- offset, size := meta.Offset, meta.Length
- _, err := ReadDV(iceio.LocalFS{}, newDVTestFile(path, 5,
&offset, &size))
- require.Error(t, err)
- assert.Contains(t, err.Error(), "must be non-negative")
- })
+ // Note: the previous "malformed property is rejected" and "negative
+ // cardinality rejected on read" subtests were removed once the puffin
+ // writer started rejecting non-numeric and negative cardinality values
+ // at AddBlob time (via ParseUint). The reader-side checks in
+ // blobCardinality stay as belt-and-suspenders for third-party-written
+ // files but are no longer reachable from any in-tree writer path used
+ // to construct test fixtures. Writer-side coverage:
+ // TestWriterValidation/deletion_vector_non-numeric_cardinality
+ // TestWriterValidation/deletion_vector_negative_cardinality
+ // (both in puffin_test.go).
t.Run("manifest offset not found in footer is rejected with a precise
error", func(t *testing.T) {
// Manifest entry points at an in-bounds offset that doesn't
match