This is an automated email from the ASF dual-hosted git repository.
laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new e308fedb Skip DELETED tombstones in DeleteOrphanFiles' referenced set
(#1076)
e308fedb is described below
commit e308fedba4bbeeec8c142afa9c77e27698639ecf
Author: Krutika Dhananjay <[email protected]>
AuthorDate: Thu May 14 02:19:56 2026 +0530
Skip DELETED tombstones in DeleteOrphanFiles' referenced set (#1076)
DeleteOrphanFiles builds "referenced files" set by walking each live
snapshot's manifests, fetching every entry with discardDeleted=false,
and unconditionally marking entry as referenced. This treats
DELETED-status entries (tombstones) the same as ADDED/EXISTING entries,
so any data file that had ever been overwritten or deleted in a
copy-on-write operation remains referenced by the surviving snapshot's
tombstone manifest. After every snapshot that actually held the file
live is expired, the file is still reachable and orphan cleanup skips
it.
This is a deviation from iceberg-java and iceberg-python
implementations.
Switch manifest.Entries() to use discardDeleted=true so tombstones are
excluded before they reach the reference-collection loop.
---------
Co-authored-by: Krutika Dhananjay <[email protected]>
---
table/orphan_cleanup.go | 8 ++-
table/orphan_cleanup_test.go | 129 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 136 insertions(+), 1 deletion(-)
diff --git a/table/orphan_cleanup.go b/table/orphan_cleanup.go
index 81fa84ed..efc1e21a 100644
--- a/table/orphan_cleanup.go
+++ b/table/orphan_cleanup.go
@@ -268,7 +268,13 @@ func (t Table) getReferencedFiles(fs iceio.IO)
(map[string]bool, error) {
for _, manifest := range manifestFiles {
referenced[manifest.FilePath()] = true
- for entry, err := range manifest.Entries(fs, false) {
+ // discardDeleted=true: skip DELETED-status entries when
+ // computing the reachable file set. A DELETED entry is
+ // not live in this snapshot and should not pin the file
+ // against orphan cleanup once the snapshot that
+ // originally held it live has been expired.
+ // This matches iceberg-java and pyiceberg behavior.
+ for entry, err := range manifest.Entries(fs, true) {
if err != nil {
return nil, fmt.Errorf("failed to read
manifest entries: %w", err)
}
diff --git a/table/orphan_cleanup_test.go b/table/orphan_cleanup_test.go
index a3b2cb23..43b01266 100644
--- a/table/orphan_cleanup_test.go
+++ b/table/orphan_cleanup_test.go
@@ -23,6 +23,10 @@ import (
"testing"
"time"
+ "github.com/apache/arrow-go/v18/arrow"
+ "github.com/apache/arrow-go/v18/arrow/array"
+ "github.com/apache/arrow-go/v18/arrow/memory"
+ "github.com/apache/iceberg-go"
"github.com/apache/iceberg-go/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -620,3 +624,128 @@ func TestDeleteFilesFallsBackToExistingBehavior(t
*testing.T) {
assert.ElementsMatch(t, orphans, mock.removed)
assert.ElementsMatch(t, orphans, deleted)
}
+
+type inMemoryCatalog struct {
+ metadata Metadata
+}
+
+func (c *inMemoryCatalog) CommitTable(
+ ctx context.Context,
+ ident Identifier,
+ reqs []Requirement,
+ updates []Update,
+) (Metadata, string, error) {
+ meta, err := UpdateTableMetadata(c.metadata, updates, "")
+ if err != nil {
+ return nil, "", err
+ }
+ c.metadata = meta
+
+ return meta, "", nil
+}
+
+func (c *inMemoryCatalog) LoadTable(ctx context.Context, ident Identifier)
(*Table, error) {
+ return nil, nil
+}
+
+func TestGetReferencedFiles_OverwriteThenExpireExcludesTombstones(t
*testing.T) {
+ ctx := context.Background()
+ tableLocation := t.TempDir()
+
+ schema := iceberg.NewSchema(0,
+ iceberg.NestedField{ID: 1, Name: "id", Type:
iceberg.PrimitiveTypes.Int64, Required: true},
+ )
+ arrSchema := arrow.NewSchema([]arrow.Field{
+ {Name: "id", Type: arrow.PrimitiveTypes.Int64, Nullable: false},
+ }, nil)
+ spec := *iceberg.UnpartitionedSpec
+
+ meta, err := NewMetadata(schema, &spec, UnsortedSortOrder,
tableLocation,
+ iceberg.Properties{PropertyFormatVersion: "2"})
+ require.NoError(t, err)
+
+ fs := io.LocalFS{}
+ tbl := New(
+ Identifier{"db", "tbl"},
+ meta,
+ tableLocation+"/metadata/v0.metadata.json",
+ func(context.Context) (io.IO, error) { return fs, nil },
+ &inMemoryCatalog{meta},
+ )
+
+ // Step 1: append id=1. Produces snapshot 1 with one ADDED entry for
fileA.
+ arrA, err := array.TableFromJSON(memory.DefaultAllocator, arrSchema,
[]string{`[{"id": 1}]`})
+ require.NoError(t, err)
+ defer arrA.Release()
+ tbl, err = tbl.AppendTable(ctx, arrA, 1, nil)
+ require.NoError(t, err)
+
+ snap1 := tbl.CurrentSnapshot()
+ require.NotNil(t, snap1)
+ pathA := dataFilePathsFromSnapshot(t, snap1, fs,
iceberg.EntryStatusADDED)
+ require.Len(t, pathA, 1, "expected one ADDED data file after append")
+ fileA := pathA[0]
+
+ // Step 2: overwrite with id=2. Produces snapshot 2 whose manifest list
+ // contains [added-fileB-manifest, deleted-fileA-manifest]. fileA still
+ // lives in snapshot 1's manifest as ADDED at this point.
+ arrB, err := array.TableFromJSON(memory.DefaultAllocator, arrSchema,
[]string{`[{"id": 2}]`})
+ require.NoError(t, err)
+ defer arrB.Release()
+ tbl, err = tbl.OverwriteTable(ctx, arrB, 1, nil)
+ require.NoError(t, err)
+ require.Len(t, tbl.Metadata().Snapshots(), 2, "expected two snapshots
after overwrite")
+
+ pathB := dataFilePathsFromSnapshot(t, tbl.CurrentSnapshot(), fs,
iceberg.EntryStatusADDED)
+ require.Len(t, pathB, 1, "expected one ADDED data file after overwrite")
+ fileB := pathB[0]
+
+ // Step 3: expire snapshot 1, keeping only the overwrite snapshot.
+ // WithPostCommit(false) keeps fileA on disk so the test only exercises
+ // metadata reachability, not the side-effect of file removal.
+ tx := tbl.NewTransaction()
+ require.NoError(t, tx.ExpireSnapshots(
+ WithRetainLast(1),
+ WithOlderThan(0),
+ WithPostCommit(false),
+ ))
+ tbl, err = tx.Commit(ctx)
+ require.NoError(t, err)
+ require.Len(t, tbl.Metadata().Snapshots(), 1,
+ "only the overwrite snapshot should remain after expiration")
+
+ // fileA is now referenced only via a DELETED entry in the surviving
+ // snapshot's tombstone manifest. The fix must exclude it.
+ refs, err := tbl.getReferencedFiles(fs)
+ require.NoError(t, err)
+
+ assert.True(t, refs[fileB],
+ "new live file (ADDED in surviving snapshot) must be in
reference set")
+ assert.False(t, refs[fileA],
+ "overwritten file (only present as DELETED tombstone) must NOT
be in reference set")
+}
+
+// dataFilePathsFromSnapshot returns the data-file paths referenced by the
+// given snapshot's manifests, filtered to entries matching wantStatus.
+func dataFilePathsFromSnapshot(
+ t *testing.T,
+ snap *Snapshot,
+ fs io.IO,
+ wantStatus iceberg.ManifestEntryStatus,
+) []string {
+ t.Helper()
+ manifests, err := snap.Manifests(fs)
+ require.NoError(t, err)
+
+ var paths []string
+ for _, m := range manifests {
+ for e, err := range m.Entries(fs, false) {
+ require.NoError(t, err)
+ if e.Status() == wantStatus {
+ paths = append(paths, e.DataFile().FilePath())
+ }
+ }
+ }
+
+ return paths
+}