This is an automated email from the ASF dual-hosted git repository.
laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new 775d8aa5 feat(manifest): omit data_file.distinct_counts on v3
manifests (#1039)
775d8aa5 is described below
commit 775d8aa56e5f509ecd6a0be148787ec6230139bc
Author: Masa <[email protected]>
AuthorDate: Sat May 9 18:18:38 2026 +1000
feat(manifest): omit data_file.distinct_counts on v3 manifests (#1039)
Fixes #1001.
## Summary
Add a defensive guard in `v3writerImpl.prepareEntry` that clears the
Avro-facing `dataFile.DistinctCounts` pointer on entries we own, so v3
manifests cannot emit `data_file.distinct_counts` (deprecated by the v3
spec; Java parity:
[apache/iceberg#12182](https://github.com/apache/iceberg/pull/12182)).
## Scope decision
Per the discussion on #1001 (reviewer approved option **(c)
Defensive-only**), this PR ships the minimal v3 gate and defers the v2
schema gap to a separate issue.
**Why minimal**: while implementing this, I found that
`data_file.distinct_counts` is **already** absent from the on-disk Avro
for every manifest version on `main` — the per-version record schemas in
[`internal/avro_schemas.go`](https://github.com/apache/iceberg-go/blob/main/internal/avro_schemas.go)
(`data_file_v1` L238, `data_file_v2` L276, `data_file_v3` L360) do not
declare a `distinct_counts` field, so the `hamba/avro` encoder drops the
Go-side pointer at encode time on every version. The intended v3 outcome
is therefore already in effect, but the v2 path is **non-compliant** (v2
spec still requires the field). Tracked separately as #1038.
The guard added here is **forward-compatibility insurance**: when #1038
is fixed by adding `distinct_counts` back to `data_file_v2`, the v3 path
will stay compliant with no further code changes.
The `dataFile` struct, `DistinctValueCounts()` getter, and
`DataFileBuilder.DistinctValueCounts` setter are intentionally preserved
— the change is writer-side only.
## Test
`TestWriteManifestV3OmitsDistinctCounts` writes a v3 manifest with
non-empty distinct counts supplied via
`DataFileBuilder.DistinctValueCounts`, round-trips it through
`ReadManifest`, and asserts the read side observes no distinct counts.
A v2 "keeps distinct counts" test was deliberately **not** added to this
PR: against today's broken behavior it would either lie (assert `Empty`
is correct) or fail CI (assert `{1:42}`). The v2 round-trip test belongs
in the same commit that fixes the v2 schema (see #1038), where it can
flip from failing to passing alongside the schema change.
## Verification
- `make test` — all packages green.
- `make lint` — clean.
- DCO sign-off present.
## Related
- Fixes #1001
- Discovered side-issue: #1038 (v2 schema gap)
- Java precedent:
[apache/iceberg#12182](https://github.com/apache/iceberg/pull/12182)
- Parent v3 tracker: #589
Signed-off-by: mzzz-zzm <[email protected]>
---
manifest.go | 10 ++++++++++
manifest_test.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/manifest.go b/manifest.go
index 2efefe8a..b6fb77f9 100644
--- a/manifest.go
+++ b/manifest.go
@@ -944,6 +944,16 @@ func (v3writerImpl) prepareEntry(entry *manifestEntry,
snapshotID int64) (Manife
}
}
+ // v3 spec deprecates data_file.distinct_counts (Java parity:
+ // apache/iceberg#12182). prepareEntry takes ownership of the entry's
data
+ // file and clears the Avro-facing pointer; the cached distinctCntMap
and
+ // DistinctValueCounts() getter are intentionally preserved so
in-process
+ // readers keep their view. Best-effort: only the in-tree *dataFile is
+ // cleared; third-party DataFile impls bypass this guard.
+ if df, ok := entry.DataFile().(*dataFile); ok {
+ df.DistinctCounts = nil
+ }
+
return entry, nil
}
diff --git a/manifest_test.go b/manifest_test.go
index 1bf944e8..978e068e 100644
--- a/manifest_test.go
+++ b/manifest_test.go
@@ -2115,6 +2115,53 @@ func (m *ManifestTestSuite)
TestManifestRoundTripSortOrderID() {
m.Equal(expectedSortOrderID, *got)
}
+// TestWriteManifestV3OmitsDistinctCounts verifies the v3 writer clears
+// data_file.distinct_counts (deprecated by v3 spec; Java parity:
+// apache/iceberg#12182). v2 round-trip will be added with #1038.
+func (m *ManifestTestSuite) TestWriteManifestV3OmitsDistinctCounts() {
+ partitionSpec := NewPartitionSpecID(0)
+ snapshotID := int64(1)
+ seqNum := int64(1)
+
+ dataFileBuilder, err := NewDataFileBuilder(
+ partitionSpec,
+ EntryContentData,
+ "s3://bucket/ns/table/data/distinct.parquet",
+ ParquetFile,
+ map[int]any{},
+ map[int]string{},
+ map[int]int{},
+ 1,
+ 1,
+ )
+ m.Require().NoError(err)
+ dataFileBuilder.DistinctValueCounts(map[int]int64{1: 42})
+
+ entry := NewManifestEntry(
+ EntryStatusADDED,
+ &snapshotID,
+ &seqNum, &seqNum,
+ dataFileBuilder.Build(),
+ )
+
+ var buf bytes.Buffer
+ _, err = WriteManifest(
+ "s3://bucket/ns/table/metadata/distinct.avro", &buf, 3,
+ partitionSpec,
+ NewSchema(
+ 0,
+ NestedField{ID: 1, Name: "id", Type: Int64Type{},
Required: true},
+ ),
+ snapshotID,
+ []ManifestEntry{entry},
+ )
+ m.Require().NoError(err)
+
+ df, ok := entry.DataFile().(*dataFile)
+ m.Require().True(ok)
+ m.Nil(df.DistinctCounts, "v3 writer must clear DistinctCounts on the
entry's *dataFile")
+}
+
func (m *ManifestTestSuite) TestWriteManifestClosesWriterOnEntryError() {
partitionSpec := NewPartitionSpecID(1,
PartitionField{FieldID: 1000, SourceIDs: []int{1}, Name:
"VendorID", Transform: IdentityTransform{}},