This is an automated email from the ASF dual-hosted git repository.
XiaoHongbo-Hope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new cc9bb8e165 [python] Fix manifest read failure when _WRITE_COLS
contains system fields (#8131)
cc9bb8e165 is described below
commit cc9bb8e165be69c05e615e008d1b9e1a4d26b187
Author: XiaoHongbo <[email protected]>
AuthorDate: Fri Jun 5 17:39:20 2026 +0800
[python] Fix manifest read failure when _WRITE_COLS contains system fields
(#8131)
### Purpose
When reading a table whose data files have `_WRITE_COLS` containing
system fields (e.g. `_ROW_ID`, `_SEQUENCE_NUMBER`), the manifest read
fails with:
`KeyError: '_ROW_ID'`
This change aligns with the Java-side fix in #7797: metadata fields that
are not in the table schema are skipped when resolving value stats fields
from `_WRITE_COLS`.
### Test
- `test_read_write_cols_with_system_field`
---
.../pypaimon/manifest/manifest_file_manager.py | 5 ++-
.../tests/manifest/manifest_manager_test.py | 43 ++++++++++++++++++++++
2 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/paimon-python/pypaimon/manifest/manifest_file_manager.py b/paimon-python/pypaimon/manifest/manifest_file_manager.py
index 973bd75534..6a0fd7dd6a 100644
--- a/paimon-python/pypaimon/manifest/manifest_file_manager.py
+++ b/paimon-python/pypaimon/manifest/manifest_file_manager.py
@@ -190,7 +190,10 @@ class ManifestFileManager:
fields = schema_fields
else:
read_fields = file_dict['_WRITE_COLS']
- fields = [self.table.field_dict[col] for col in
read_fields]
+ # writeCols may contain metadata fields (e.g. _ROW_ID, _SEQUENCE_NUMBER)
+ data_field_dict = {f.name: f for f in schema_fields}
+ fields = [data_field_dict[col] for col in read_fields
+ if col in data_field_dict]
else:
fields = schema_fields
elif not file_dict['_VALUE_STATS_COLS']:
diff --git a/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py b/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py
index 80cb7ed3d6..2c6423af3f 100644
--- a/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py
+++ b/paimon-python/pypaimon/tests/manifest/manifest_manager_test.py
@@ -36,6 +36,7 @@ from pypaimon.manifest.schema.data_file_meta import
DataFileMeta
from pypaimon.manifest.schema.manifest_entry import ManifestEntry
from pypaimon.manifest.schema.manifest_file_meta import ManifestFileMeta
from pypaimon.manifest.schema.simple_stats import SimpleStats
+from pypaimon.schema.data_types import AtomicType, DataField
from pypaimon.schema.schema import Schema
from pypaimon.table.row.generic_row import GenericRow
@@ -282,6 +283,48 @@ class ManifestFileManagerTest(_ManifestManagerSetup):
"test-manifest.avro", manifest_entry_filter=lambda e: e.bucket ==
0)
self.assertEqual(len(result_filtered), 2)
+ def test_read_write_cols_with_system_field(self):
+ manager = self._make_manager()
+
+ id_field = DataField(0, 'id', AtomicType('INT', nullable=True))
+ min_row = GenericRow([1], [id_field])
+ max_row = GenericRow([10], [id_field])
+ value_stats = SimpleStats(
+ min_values=min_row, max_values=max_row, null_counts=[2])
+
+ entry = ManifestEntry(
+ kind=0,
+ partition=_EMPTY_ROW,
+ bucket=0,
+ total_buckets=1,
+ file=DataFileMeta(
+ file_name="data-dirty.parquet", file_size=1024, row_count=50,
+ min_key=_EMPTY_ROW, max_key=_EMPTY_ROW,
+ key_stats=_EMPTY_STATS, value_stats=value_stats,
+ min_sequence_number=1, max_sequence_number=50,
+ schema_id=0, level=0, extra_files=[],
+ creation_time=Timestamp.from_epoch_millis(0),
+ delete_row_count=0, embedded_index=None, file_source=None,
+ value_stats_cols=None, external_path=None,
+ first_row_id=0,
+ write_cols=["id", "_ROW_ID", "_SEQUENCE_NUMBER"],
+ ),
+ )
+ manager.write("dirty-manifest.avro", [entry])
+
+ entries = manager.read("dirty-manifest.avro", drop_stats=False)
+ self.assertEqual(len(entries), 1)
+ self.assertEqual(
+ entries[0].file.write_cols, ["id", "_ROW_ID", "_SEQUENCE_NUMBER"])
+
+ read_stats = entries[0].file.value_stats
+ stats_field_names = [f.name for f in read_stats.min_values.fields]
+ self.assertEqual(stats_field_names, ["id"])
+
+ self.assertEqual(read_stats.min_values.get_field(0), 1)
+ self.assertEqual(read_stats.max_values.get_field(0), 10)
+ self.assertEqual(read_stats.null_counts, [2])
+
class ManifestListManagerTest(_ManifestManagerSetup):
"""Tests for ManifestListManager."""