tanmayrauth commented on code in PR #1136:
URL: https://github.com/apache/iceberg-go/pull/1136#discussion_r3316558195


##########
table/dv/dv_writer.go:
##########
@@ -21,51 +21,102 @@ import (
        "bytes"
        "context"
        "fmt"
+       "maps"
        "strconv"
 
        "github.com/apache/iceberg-go"
        iceio "github.com/apache/iceberg-go/io"
        "github.com/apache/iceberg-go/puffin"
 )
 
+// dvEntry holds the per-data-file state a DV manifest entry needs. Each entry
+// is keyed by the referenced data file path; the spec and partition record
+// come from the data file itself and are propagated unchanged onto the output
+// DV manifest entry — matching Java's BaseDVFileWriter.Deletes shape.
+type dvEntry struct {
+       bitmap        *RoaringPositionBitmap
+       spec          iceberg.PartitionSpec
+       partitionData map[int]any
+}
+
 // DVWriter accumulates deletion positions per data file and flushes them as
 // a single Puffin file containing one deletion-vector-v1 blob per data file.
 // The returned DataFile entries are ready for RowDelta.AddDeletes().
 type DVWriter struct {
        fs      iceio.WriteFileIO
-       entries map[string]*RoaringPositionBitmap
+       entries map[string]*dvEntry
        order   []string
 }
 
 // NewDVWriter creates a DVWriter backed by the given writable filesystem.
 func NewDVWriter(fs iceio.WriteFileIO) *DVWriter {
        return &DVWriter{
                fs:      fs,
-               entries: make(map[string]*RoaringPositionBitmap),
+               entries: make(map[string]*dvEntry),
        }
 }
 
-// Add accumulates positions to delete for a given data file.
-// Positions are deduplicated via the underlying roaring bitmap.
-func (w *DVWriter) Add(dataFilePath string, positions []int64) {
-       bm, ok := w.entries[dataFilePath]
+// Add accumulates positions to delete for a given data file. The spec and
+// partitionData are the data file's own partition metadata (from its manifest
+// entry) and are propagated to the output DV manifest entry, so partitioned
+// tables produce DV entries with the correct partition record. Positions are
+// deduplicated via the underlying roaring bitmap.
+//
+// partitionData keys must be partition field IDs (PartitionField.FieldID), not
+// source column IDs. NewDataFileBuilder iterates spec.Fields() and re-keys by
+// field name using these IDs; wrong keys silently produce an empty partition
+// record on the output DataFile.
+//
+// First Add for a given dataFilePath captures spec and partitionData on the
+// entry; later Adds for the same path append positions only and ignore the
+// new spec/partitionData args. This mirrors Java's BaseDVFileWriter, which
+// stores partition metadata via computeIfAbsent on the same key. Callers must
+// not pass conflicting partition values across Adds for the same data file —
+// the writer trusts the first-Add values for the rest of the writer's life.
+func (w *DVWriter) Add(dataFilePath string, positions []int64, spec 
iceberg.PartitionSpec, partitionData map[int]any) {
+       entry, ok := w.entries[dataFilePath]
        if !ok {
-               bm = NewRoaringPositionBitmap()
-               w.entries[dataFilePath] = bm
+               // Defensive copies on capture so a caller that mutates or 
reuses

Review Comment:
   Super minor doc nit, feel free to ignore: I tried mutating 
originalSpec.Fields()[0].SourceIDs[0] after Add and the captured spec did see 
the change, since PartitionField is value-copied but the inner SourceIDs []int 
slice still aliases. In practice this almost certainly never matters (nobody 
mutates SourceIDs), but if you wanted the comment to be airtight, either a 
slices.Clone on SourceIDs per field  or just softening the wording to something 
like "shallow copy — partition fields aren't mutated in this codebase" would do 
it.  



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to