This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git


The following commit(s) were added to refs/heads/main by this push:
     new a38f57ff fix(manifest): restore data_file.distinct_counts on v1/v2 Avro schemas (#1044)
a38f57ff is described below

commit a38f57ff5171ff10ebc354992d8d49c253893ccf
Author: Masa <[email protected]>
AuthorDate: Wed May 13 02:21:21 2026 +1000

    fix(manifest): restore data_file.distinct_counts on v1/v2 Avro schemas (#1044)
    
    v1 and v2 manifest writers silently dropped data_file.distinct_counts
    (field id 111) because it was missing from the data_file_v1 and
    data_file_v2 record declarations in internal/avro_schemas.go. The
    hamba/avro encoder writes only declared fields, so the Go-side *[]colMap
    pointer was discarded on encode for every version.
    
    The Iceberg v1 and v2 specs list distinct_counts as a writable optional
    field (map<123: int, 124: long>); the v3 spec deprecates it (see
    apache/iceberg#12182 and #1001/#1039). This commit:
    
    - Adds the distinct_counts field to data_file_v1 and data_file_v2 with
    the canonical map element ids (key=123, value=124), inserted after
    nan_value_counts to match the spec's field ordering.
    - Leaves data_file_v3 unchanged. The defensive guard added in #1039
    (v3writerImpl.prepareEntry) becomes load-bearing once this lands: it
    ensures v3 manifests still omit the field even though the encoder is now
    capable of serializing it via the v1/v2 schema pathway.
    
    Tests:
    - TestWriteManifestV2KeepsDistinctCounts - v2 round-trip preserves the
    supplied distinct counts.
    - TestWriteManifestV1KeepsDistinctCounts - v1 round-trip preserves them
    too.
    
    Both tests fail on origin/main (encoder drops the field, returns nil)
    and pass after the schema additions, exercising the change directly.
    
    Fixes #1038
    Related: #1001, #1039
    
    Signed-off-by: mzzz-zzm <[email protected]>
---
 internal/avro_schemas.go |  6 +++++
 manifest_test.go         | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/internal/avro_schemas.go b/internal/avro_schemas.go
index 720f4be4..2dc8ba8f 100644
--- a/internal/avro_schemas.go
+++ b/internal/avro_schemas.go
@@ -259,6 +259,9 @@ func init() {
                        fieldNode("nan_value_counts",
                                NullableNode(newMapNode("k138_v139", IntNode, LongNode, 138, 139)),
                                137, withDoc("map of value to count")),
+                       fieldNode("distinct_counts",
+                               NullableNode(newMapNode("k123_v124", IntNode, LongNode, 123, 124)),
+                               111, withDoc("map of column id to distinct value count")),
                        fieldNode("lower_bounds",
                                NullableNode(newMapNode("k126_v127", IntNode, BytesNode, 126, 127)),
                                125, withDoc("map of column id to lower bound")),
@@ -295,6 +298,9 @@ func init() {
                        fieldNode("nan_value_counts",
                                NullableNode(newMapNode("k138_v139", IntNode, LongNode, 138, 139)),
                                137, withDoc("map of value to count")),
+                       fieldNode("distinct_counts",
+                               NullableNode(newMapNode("k123_v124", IntNode, LongNode, 123, 124)),
+                               111, withDoc("map of column id to distinct value count")),
                        fieldNode("lower_bounds",
                                NullableNode(newMapNode("k126_v127", IntNode, BytesNode, 126, 127)),
                                125, withDoc("map of column id to lower bound")),
diff --git a/manifest_test.go b/manifest_test.go
index 978e068e..69e53cf0 100644
--- a/manifest_test.go
+++ b/manifest_test.go
@@ -2334,3 +2334,64 @@ func (m *ManifestTestSuite) TestEntriesCloseErrorAsFinalPair() {
                "terminal error must equal or wrap the simulated close error")
        m.Equal(1, file.closeCount, "file must be closed exactly once even when Close returns an error")
 }
+
+// TestWriteManifestV2KeepsDistinctCounts is a regression guard that v2
+// manifest writers preserve data_file.distinct_counts (id 111) per the
+// Iceberg v2 spec. Fixes #1038.
+func (m *ManifestTestSuite) TestWriteManifestV2KeepsDistinctCounts() {
+       m.assertDistinctCountsRoundTrip(2)
+}
+
+// TestWriteManifestV1KeepsDistinctCounts is a regression guard that v1
+// manifest writers preserve data_file.distinct_counts (id 111) per the
+// Iceberg v1 spec. Fixes #1038.
+func (m *ManifestTestSuite) TestWriteManifestV1KeepsDistinctCounts() {
+       m.assertDistinctCountsRoundTrip(1)
+}
+
+// assertDistinctCountsRoundTrip writes a manifest at the given format
+// version with distinct_counts populated for one column, round-trips it
+// through ReadManifest, and asserts the read side observes the same map.
+func (m *ManifestTestSuite) assertDistinctCountsRoundTrip(version int) {
+       partitionSpec := NewPartitionSpecID(0)
+       snapshotID := int64(1)
+       seqNum := int64(1)
+
+       dataFileBuilder, err := NewDataFileBuilder(
+               partitionSpec,
+               EntryContentData,
+               "s3://bucket/ns/table/data/distinct.parquet",
+               ParquetFile,
+               map[int]any{},
+               map[int]string{},
+               map[int]int{},
+               1,
+               1,
+       )
+       m.Require().NoError(err)
+       dataFileBuilder.DistinctValueCounts(map[int]int64{1: 42})
+
+       var buf bytes.Buffer
+       file, err := WriteManifest(
+               "s3://bucket/ns/table/metadata/distinct.avro", &buf, version,
+               partitionSpec,
+               NewSchema(0,
+                       NestedField{ID: 1, Name: "id", Type: Int64Type{}, Required: true},
+               ),
+               snapshotID,
+               []ManifestEntry{NewManifestEntry(
+                       EntryStatusADDED,
+                       &snapshotID,
+                       &seqNum, &seqNum,
+                       dataFileBuilder.Build(),
+               )},
+       )
+       m.Require().NoError(err)
+
+       entries, err := ReadManifest(file, &buf, false)
+       m.Require().NoError(err)
+       m.Require().Len(entries, 1)
+
+       m.Equal(map[int]int64{1: 42}, entries[0].DataFile().DistinctValueCounts(),
+               "manifest writer must preserve distinct_counts for the requested format version")
+}

Reply via email to