This is an automated email from the ASF dual-hosted git repository.

XiaoHongbo-Hope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new cc9bb8e165 [python] Fix manifest read failure when _WRITE_COLS contains system fields (#8131)
cc9bb8e165 is described below

commit cc9bb8e165be69c05e615e008d1b9e1a4d26b187
Author: XiaoHongbo <[email protected]>
AuthorDate: Fri Jun 5 17:39:20 2026 +0800

    [python] Fix manifest read failure when _WRITE_COLS contains system fields (#8131)
    
    ### Purpose
    When reading a table whose data files have `_WRITE_COLS` containing
    system fields (e.g. `_ROW_ID`, `_SEQUENCE_NUMBER`), the read
    fails with:

        KeyError: '_ROW_ID'
    
    This aligns with the Java-side fix in #7797: metadata fields that are
    not in the table schema are skipped when resolving value stats fields
    from `_WRITE_COLS`.
    
    ### Test

    - `test_read_write_cols_with_system_field`
---
 .../pypaimon/manifest/manifest_file_manager.py     |  5 ++-
 .../tests/manifest/manifest_manager_test.py        | 43 ++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/manifest/manifest_file_manager.py 
b/paimon-python/pypaimon/manifest/manifest_file_manager.py
index 973bd75534..6a0fd7dd6a 100644
--- a/paimon-python/pypaimon/manifest/manifest_file_manager.py
+++ b/paimon-python/pypaimon/manifest/manifest_file_manager.py
@@ -190,7 +190,10 @@ class ManifestFileManager:
                     fields = schema_fields
                 else:
                     read_fields = file_dict['_WRITE_COLS']
-                    fields = [self.table.field_dict[col] for col in 
read_fields]
+                    # writeCols may contain metadata fields (e.g. _ROW_ID, 
_SEQUENCE_NUMBER)
+                    data_field_dict = {f.name: f for f in schema_fields}
+                    fields = [data_field_dict[col] for col in read_fields
+                              if col in data_field_dict]
             else:
                 fields = schema_fields
         elif not file_dict['_VALUE_STATS_COLS']:
diff --git a/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py 
b/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py
index 80cb7ed3d6..2c6423af3f 100644
--- a/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py
+++ b/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py
@@ -36,6 +36,7 @@ from pypaimon.manifest.schema.data_file_meta import 
DataFileMeta
 from pypaimon.manifest.schema.manifest_entry import ManifestEntry
 from pypaimon.manifest.schema.manifest_file_meta import ManifestFileMeta
 from pypaimon.manifest.schema.simple_stats import SimpleStats
+from pypaimon.schema.data_types import AtomicType, DataField
 from pypaimon.schema.schema import Schema
 from pypaimon.table.row.generic_row import GenericRow
 
@@ -282,6 +283,48 @@ class ManifestFileManagerTest(_ManifestManagerSetup):
             "test-manifest.avro", manifest_entry_filter=lambda e: e.bucket == 
0)
         self.assertEqual(len(result_filtered), 2)
 
+    def test_read_write_cols_with_system_field(self):
+        manager = self._make_manager()
+
+        id_field = DataField(0, 'id', AtomicType('INT', nullable=True))
+        min_row = GenericRow([1], [id_field])
+        max_row = GenericRow([10], [id_field])
+        value_stats = SimpleStats(
+            min_values=min_row, max_values=max_row, null_counts=[2])
+
+        entry = ManifestEntry(
+            kind=0,
+            partition=_EMPTY_ROW,
+            bucket=0,
+            total_buckets=1,
+            file=DataFileMeta(
+                file_name="data-dirty.parquet", file_size=1024, row_count=50,
+                min_key=_EMPTY_ROW, max_key=_EMPTY_ROW,
+                key_stats=_EMPTY_STATS, value_stats=value_stats,
+                min_sequence_number=1, max_sequence_number=50,
+                schema_id=0, level=0, extra_files=[],
+                creation_time=Timestamp.from_epoch_millis(0),
+                delete_row_count=0, embedded_index=None, file_source=None,
+                value_stats_cols=None, external_path=None,
+                first_row_id=0,
+                write_cols=["id", "_ROW_ID", "_SEQUENCE_NUMBER"],
+            ),
+        )
+        manager.write("dirty-manifest.avro", [entry])
+
+        entries = manager.read("dirty-manifest.avro", drop_stats=False)
+        self.assertEqual(len(entries), 1)
+        self.assertEqual(
+            entries[0].file.write_cols, ["id", "_ROW_ID", "_SEQUENCE_NUMBER"])
+
+        read_stats = entries[0].file.value_stats
+        stats_field_names = [f.name for f in read_stats.min_values.fields]
+        self.assertEqual(stats_field_names, ["id"])
+
+        self.assertEqual(read_stats.min_values.get_field(0), 1)
+        self.assertEqual(read_stats.max_values.get_field(0), 10)
+        self.assertEqual(read_stats.null_counts, [2])
+
 
 class ManifestListManagerTest(_ManifestManagerSetup):
     """Tests for ManifestListManager."""

Reply via email to