laskoviymishka commented on code in PR #1133:
URL: https://github.com/apache/iceberg-go/pull/1133#discussion_r3310389738


##########
table/orphan_cleanup.go:
##########
@@ -411,35 +482,13 @@ func makeFileWalkFunc(fn func(path string, info 
stdfs.FileInfo) error, pathTrans
        }
 }
 
-func identifyOrphanFiles(allFiles []string, referencedFiles map[string]bool, 
cfg *orphanCleanupConfig) ([]string, error) {
-       normalizedReferencedFiles := make(map[string]string)
-       for refPath := range referencedFiles {
-               normalizedPath := normalizeFilePathWithConfig(refPath, cfg)
-               normalizedReferencedFiles[normalizedPath] = refPath
-               // Also include the original path for direct lookup
-               normalizedReferencedFiles[refPath] = refPath
-       }
-
-       var orphans []string
-
-       for _, file := range allFiles {
-               isOrphan, err := isFileOrphan(file, referencedFiles, 
normalizedReferencedFiles, cfg)
-               if err != nil {
-                       return nil, fmt.Errorf("failed to determine if file %s 
is orphan: %w", file, err)
-               }
-
-               if isOrphan {
-                       orphans = append(orphans, file)
-               }
-       }
-
-       return orphans, nil
-}
-
 func isFileOrphan(file string, referencedFiles map[string]bool, 
normalizedReferencedFiles map[string]string, cfg *orphanCleanupConfig) (bool, 
error) {
        normalizedFile := normalizeFilePathWithConfig(file, cfg)
 
-       if referencedFiles[file] || referencedFiles[normalizedFile] {
+       if _, ok := referencedFiles[file]; ok {

Review Comment:
   This is actually a correctness fix hiding inside the refactor, not just a 
style change. The old `referencedFiles[file] || 
referencedFiles[normalizedFile]` returned the map value, which is `false` for 
metadata-file entries (manifest lists, manifests, statistics) — so those 
entries silently fell through into the `checkPrefixMismatch` branch and could 
trip `PrefixMismatchError` on a host mismatch. The comma-ok form correctly 
treats any key-in-map as referenced, regardless of the bool value.
   
   Worth a one-line comment locking in the intent ("any presence in 
`referencedFiles` means referenced; the bool distinguishes data vs metadata for 
gc.enabled, not membership") and ideally a regression test for the 
metadata-file-with-host-mismatch case.



##########
table/orphan_cleanup.go:
##########
@@ -195,24 +202,62 @@ func (t Table) executeOrphanCleanup(ctx context.Context, 
cfg *orphanCleanupConfi
                scanLocation = t.metadata.Location()
        }
 
-       referencedFiles, err := t.getReferencedFiles(fs, true)
-       if err != nil {
-               return OrphanCleanupResult{}, fmt.Errorf("failed to get 
referenced files: %w", err)
+       // Run the S3 walk and referenced-file collection concurrently.
+       var referencedFiles map[string]bool
+       var scannedFiles []scannedFile
+
+       g, gctx := errgroup.WithContext(ctx)
+       g.Go(func() error {
+               var err error
+               referencedFiles, err = t.getReferencedFiles(gctx, fs, 
cfg.maxConcurrency, true)
+
+               return err
+       })
+
+       g.Go(func() error {
+               cutoff := time.Now().Add(-cfg.olderThan)
+
+               return walkDirectory(fs, scanLocation, func(path string, info 
stdfs.FileInfo) error {
+                       if gctx.Err() != nil {
+                               return gctx.Err()
+                       }
+                       if info.IsDir() || !info.ModTime().Before(cutoff) {
+                               return nil
+                       }
+                       scannedFiles = append(scannedFiles, scannedFile{path: 
path, size: info.Size()})

Review Comment:
   Safe today — goroutine 1 only writes `referencedFiles`, goroutine 2 only 
writes `scannedFiles`, no cross-reads. But the contract that these two are 
partitioned is invisible to a future maintainer, and there's no mutex or 
comment to flag it.
   
   A short comment on the errgroup block ("goroutine 1 owns `referencedFiles`, 
goroutine 2 owns `scannedFiles`; no shared writes") would make the next edit 
safer. wdyt?



##########
table/orphan_cleanup.go:
##########
@@ -164,7 +166,7 @@ type OrphanCleanupResult struct {
        TotalSizeBytes      int64
 }
 
-func (t Table) DeleteOrphanFiles(ctx context.Context, opts 
...OrphanCleanupOption) (OrphanCleanupResult, error) {
+func (t *Table) DeleteOrphanFiles(ctx context.Context, opts 
...OrphanCleanupOption) (OrphanCleanupResult, error) {

Review Comment:
   `DeleteOrphanFiles`, `executeOrphanCleanup`, and `getReferencedFiles` moved 
to pointer receivers, but `PurgeFiles` further down still uses a value receiver 
and calls into `getReferencedFiles`. None of these methods actually mutate `t`, 
so I'd either revert all three back to value receivers, or move `PurgeFiles` to 
`*Table` as well — splitting the method set across receiver kinds in the public 
API is the kind of subtle thing that's hard to undo later. wdyt?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to