This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new a38f57ff fix(manifest): restore data_file.distinct_counts on v1/v2
Avro schemas (#1044)
a38f57ff is described below
commit a38f57ff5171ff10ebc354992d8d49c253893ccf
Author: Masa <[email protected]>
AuthorDate: Wed May 13 02:21:21 2026 +1000
fix(manifest): restore data_file.distinct_counts on v1/v2 Avro schemas
(#1044)
v1 and v2 manifest writers silently dropped data_file.distinct_counts
(field id 111) because it was missing from the data_file_v1 and
data_file_v2 record declarations in internal/avro_schemas.go. The
hamba/avro encoder writes only declared fields, so the Go-side *[]colMap
pointer was discarded on encode for every version.
The Iceberg v1 and v2 specs list distinct_counts as a writable optional
field (map<123: int, 124: long>); the v3 spec deprecates it (see
apache/iceberg#12182 and #1001/#1039). This commit:
- Adds the distinct_counts field to data_file_v1 and data_file_v2 with
the canonical map element ids (key=123, value=124), inserted after
nan_value_counts to match the spec's field ordering.
- Leaves data_file_v3 unchanged. The defensive guard added in #1039
(v3writerImpl.prepareEntry) becomes load-bearing once this lands: it
ensures v3 manifests still omit the field even though the encoder is now
capable of serializing it via the v1/v2 schema pathway.
Tests:
- TestWriteManifestV2KeepsDistinctCounts - v2 round-trip preserves the
supplied distinct counts.
- TestWriteManifestV1KeepsDistinctCounts - v1 round-trip preserves them
too.
Both tests fail on origin/main (the encoder drops the field, so the
round-tripped distinct counts come back nil) and pass after the schema
additions, exercising the change directly.
Fixes #1038
Related: #1001, #1039
Signed-off-by: mzzz-zzm <[email protected]>
---
internal/avro_schemas.go | 6 +++++
manifest_test.go | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 67 insertions(+)
diff --git a/internal/avro_schemas.go b/internal/avro_schemas.go
index 720f4be4..2dc8ba8f 100644
--- a/internal/avro_schemas.go
+++ b/internal/avro_schemas.go
@@ -259,6 +259,9 @@ func init() {
fieldNode("nan_value_counts",
NullableNode(newMapNode("k138_v139", IntNode,
LongNode, 138, 139)),
137, withDoc("map of value to count")),
+ fieldNode("distinct_counts",
+ NullableNode(newMapNode("k123_v124", IntNode,
LongNode, 123, 124)),
+ 111, withDoc("map of column id to distinct
value count")),
fieldNode("lower_bounds",
NullableNode(newMapNode("k126_v127", IntNode,
BytesNode, 126, 127)),
125, withDoc("map of column id to lower
bound")),
@@ -295,6 +298,9 @@ func init() {
fieldNode("nan_value_counts",
NullableNode(newMapNode("k138_v139", IntNode,
LongNode, 138, 139)),
137, withDoc("map of value to count")),
+ fieldNode("distinct_counts",
+ NullableNode(newMapNode("k123_v124", IntNode,
LongNode, 123, 124)),
+ 111, withDoc("map of column id to distinct
value count")),
fieldNode("lower_bounds",
NullableNode(newMapNode("k126_v127", IntNode,
BytesNode, 126, 127)),
125, withDoc("map of column id to lower
bound")),
diff --git a/manifest_test.go b/manifest_test.go
index 978e068e..69e53cf0 100644
--- a/manifest_test.go
+++ b/manifest_test.go
@@ -2334,3 +2334,64 @@ func (m *ManifestTestSuite)
TestEntriesCloseErrorAsFinalPair() {
"terminal error must equal or wrap the simulated close error")
m.Equal(1, file.closeCount, "file must be closed exactly once even when
Close returns an error")
}
+
+// TestWriteManifestV2KeepsDistinctCounts is a regression guard that v2
+// manifest writers preserve data_file.distinct_counts (id 111) per the
+// Iceberg v2 spec. Fixes #1038.
+func (m *ManifestTestSuite) TestWriteManifestV2KeepsDistinctCounts() {
+ m.assertDistinctCountsRoundTrip(2)
+}
+
+// TestWriteManifestV1KeepsDistinctCounts is a regression guard that v1
+// manifest writers preserve data_file.distinct_counts (id 111) per the
+// Iceberg v1 spec. Fixes #1038.
+func (m *ManifestTestSuite) TestWriteManifestV1KeepsDistinctCounts() {
+ m.assertDistinctCountsRoundTrip(1)
+}
+
+// assertDistinctCountsRoundTrip writes a manifest at the given format
+// version with distinct_counts populated for one column, round-trips it
+// through ReadManifest, and asserts the read side observes the same map.
+func (m *ManifestTestSuite) assertDistinctCountsRoundTrip(version int) {
+ partitionSpec := NewPartitionSpecID(0)
+ snapshotID := int64(1)
+ seqNum := int64(1)
+
+ dataFileBuilder, err := NewDataFileBuilder(
+ partitionSpec,
+ EntryContentData,
+ "s3://bucket/ns/table/data/distinct.parquet",
+ ParquetFile,
+ map[int]any{},
+ map[int]string{},
+ map[int]int{},
+ 1,
+ 1,
+ )
+ m.Require().NoError(err)
+ dataFileBuilder.DistinctValueCounts(map[int]int64{1: 42})
+
+ var buf bytes.Buffer
+ file, err := WriteManifest(
+ "s3://bucket/ns/table/metadata/distinct.avro", &buf, version,
+ partitionSpec,
+ NewSchema(0,
+ NestedField{ID: 1, Name: "id", Type: Int64Type{},
Required: true},
+ ),
+ snapshotID,
+ []ManifestEntry{NewManifestEntry(
+ EntryStatusADDED,
+ &snapshotID,
+ &seqNum, &seqNum,
+ dataFileBuilder.Build(),
+ )},
+ )
+ m.Require().NoError(err)
+
+ entries, err := ReadManifest(file, &buf, false)
+ m.Require().NoError(err)
+ m.Require().Len(entries, 1)
+
+ m.Equal(map[int]int64{1: 42},
entries[0].DataFile().DistinctValueCounts(),
+ "manifest writer must preserve distinct_counts for the
requested format version")
+}