This is an automated email from the ASF dual-hosted git repository.

laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 775d8aa5 feat(manifest): omit data_file.distinct_counts on v3 
manifests (#1039)
775d8aa5 is described below

commit 775d8aa56e5f509ecd6a0be148787ec6230139bc
Author: Masa <[email protected]>
AuthorDate: Sat May 9 18:18:38 2026 +1000

    feat(manifest): omit data_file.distinct_counts on v3 manifests (#1039)
    
    Fixes #1001.
    
    ## Summary
    
    Add a defensive guard in `v3writerImpl.prepareEntry` that clears the
    Avro-facing `dataFile.DistinctCounts` pointer on entries we own, so v3
    manifests cannot emit `data_file.distinct_counts` (deprecated by the v3
    spec; Java parity:
    [apache/iceberg#12182](https://github.com/apache/iceberg/pull/12182)).
    
    ## Scope decision
    
    Per the discussion on #1001 (reviewer approved option **(c)
    Defensive-only**), this PR ships the minimal v3 gate and defers the v2
    schema gap to a separate issue.
    
    **Why minimal**: while implementing this, I found that
    `data_file.distinct_counts` is **already** absent from the on-disk Avro
    for every manifest version on `main` — the per-version record schemas in
    
[`internal/avro_schemas.go`](https://github.com/apache/iceberg-go/blob/main/internal/avro_schemas.go)
    (`data_file_v1` L238, `data_file_v2` L276, `data_file_v3` L360) do not
    declare a `distinct_counts` field, so the `hamba/avro` encoder drops the
    Go-side pointer at encode time on every version. The intended v3 outcome
    is therefore already in effect, but the v2 path is **non-compliant** (v2
    spec still requires the field). Tracked separately as #1038.
    
    The guard added here is **forward-compatibility insurance**: when #1038
    is fixed by adding `distinct_counts` back to `data_file_v2`, the v3 path
    will stay compliant with no further code changes.
    
    The `dataFile` struct, `DistinctValueCounts()` getter, and
    `DataFileBuilder.DistinctValueCounts` setter are intentionally preserved
    — the change is writer-side only.
    
    ## Test
    
    `TestWriteManifestV3OmitsDistinctCounts` writes a v3 manifest with
    non-empty distinct counts supplied via
    `DataFileBuilder.DistinctValueCounts`, round-trips it through
    `ReadManifest`, and asserts the read side observes no distinct counts.
    
    A v2 "keeps distinct counts" test was deliberately **not** added to this
    PR: against today's broken behavior it would either lie (assert `Empty`
    is correct) or fail CI (assert `{1:42}`). The v2 round-trip test belongs
    in the same commit that fixes the v2 schema (see #1038), where it can
    flip from failing to passing alongside the schema change.
    
    ## Verification
    
    - `make test` — all packages green.
    - `make lint` — clean.
    - DCO sign-off present.
    
    ## Related
    
    - Fixes #1001
    - Discovered side-issue: #1038 (v2 schema gap)
    - Java precedent:
    [apache/iceberg#12182](https://github.com/apache/iceberg/pull/12182)
    - Parent v3 tracker: #589
    
    Signed-off-by: mzzz-zzm <[email protected]>
---
 manifest.go      | 10 ++++++++++
 manifest_test.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/manifest.go b/manifest.go
index 2efefe8a..b6fb77f9 100644
--- a/manifest.go
+++ b/manifest.go
@@ -944,6 +944,16 @@ func (v3writerImpl) prepareEntry(entry *manifestEntry, 
snapshotID int64) (Manife
                }
        }
 
+       // v3 spec deprecates data_file.distinct_counts (Java parity:
+       // apache/iceberg#12182). prepareEntry takes ownership of the entry's 
data
+       // file and clears the Avro-facing pointer; the cached distinctCntMap 
and
+       // DistinctValueCounts() getter are intentionally preserved so 
in-process
+       // readers keep their view. Best-effort: only the in-tree *dataFile is
+       // cleared; third-party DataFile impls bypass this guard.
+       if df, ok := entry.DataFile().(*dataFile); ok {
+               df.DistinctCounts = nil
+       }
+
        return entry, nil
 }
 
diff --git a/manifest_test.go b/manifest_test.go
index 1bf944e8..978e068e 100644
--- a/manifest_test.go
+++ b/manifest_test.go
@@ -2115,6 +2115,53 @@ func (m *ManifestTestSuite) 
TestManifestRoundTripSortOrderID() {
        m.Equal(expectedSortOrderID, *got)
 }
 
+// TestWriteManifestV3OmitsDistinctCounts verifies the v3 writer clears
+// data_file.distinct_counts (deprecated by v3 spec; Java parity:
+// apache/iceberg#12182). v2 round-trip will be added with #1038.
+func (m *ManifestTestSuite) TestWriteManifestV3OmitsDistinctCounts() {
+       partitionSpec := NewPartitionSpecID(0)
+       snapshotID := int64(1)
+       seqNum := int64(1)
+
+       dataFileBuilder, err := NewDataFileBuilder(
+               partitionSpec,
+               EntryContentData,
+               "s3://bucket/ns/table/data/distinct.parquet",
+               ParquetFile,
+               map[int]any{},
+               map[int]string{},
+               map[int]int{},
+               1,
+               1,
+       )
+       m.Require().NoError(err)
+       dataFileBuilder.DistinctValueCounts(map[int]int64{1: 42})
+
+       entry := NewManifestEntry(
+               EntryStatusADDED,
+               &snapshotID,
+               &seqNum, &seqNum,
+               dataFileBuilder.Build(),
+       )
+
+       var buf bytes.Buffer
+       _, err = WriteManifest(
+               "s3://bucket/ns/table/metadata/distinct.avro", &buf, 3,
+               partitionSpec,
+               NewSchema(
+                       0,
+                       NestedField{ID: 1, Name: "id", Type: Int64Type{}, 
Required: true},
+               ),
+               snapshotID,
+               []ManifestEntry{entry},
+       )
+       m.Require().NoError(err)
+
+       df, ok := entry.DataFile().(*dataFile)
+       m.Require().True(ok)
+       m.Nil(df.DistinctCounts, "v3 writer must clear DistinctCounts on the 
entry's *dataFile")
+}
+
 func (m *ManifestTestSuite) TestWriteManifestClosesWriterOnEntryError() {
        partitionSpec := NewPartitionSpecID(1,
                PartitionField{FieldID: 1000, SourceIDs: []int{1}, Name: 
"VendorID", Transform: IdentityTransform{}},

Reply via email to