(paimon) branch master updated: [python] Fix stats evolution index mapping to use field ID instead of field name (#7593)

lzljs3620320 Mon, 06 Apr 2026 20:44:53 -0700

This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git



The following commit(s) were added to refs/heads/master by this push:
     new b8cc8cdf65 [python] Fix stats evolution index mapping to use field ID instead of field name (#7593)
b8cc8cdf65 is described below

commit b8cc8cdf650689679921314b405d7887b367421f
Author: Jiajia Li <[email protected]>
AuthorDate: Tue Apr 7 11:44:33 2026 +0800

    [python] Fix stats evolution index mapping to use field ID instead of field name (#7593)
    
    SimpleStatsEvolutions._create_index_cast_mapping was using field.name to
    build the mapping between table schema and data schema. This breaks
    after column renames because names change while field IDs remain stable.
    Changed to use field.id, aligning with Java's
    SchemaEvolutionUtil.createIndexMapping.
---
 .../pypaimon/manifest/simple_stats_evolutions.py   |  10 +-
 .../tests/manifest/simple_stats_evolutions_test.py | 156 +++++++++++++++++++++
 2 files changed, 162 insertions(+), 4 deletions(-)

diff --git a/paimon-python/pypaimon/manifest/simple_stats_evolutions.py b/paimon-python/pypaimon/manifest/simple_stats_evolutions.py
index f7f5946162..a3c235aff9 100644
--- a/paimon-python/pypaimon/manifest/simple_stats_evolutions.py
+++ b/paimon-python/pypaimon/manifest/simple_stats_evolutions.py
@@ -55,13 +55,15 @@ class SimpleStatsEvolutions:
         Create index and cast mapping between table fields and data fields.
         This is a simplified implementation.
         """
-        # Create a mapping from field names to indices in data_fields
-        data_field_map = {field.name: i for i, field in enumerate(data_fields)}
+        # Create a mapping from field IDs to indices in data_fields.
+        # Field IDs are stable across schema evolution (including column renames),
+        # while field names may change.
+        data_field_map = {field.id: i for i, field in enumerate(data_fields)}
 
         index_mapping = []
         for table_field in table_fields:
-            if table_field.name in data_field_map:
-                index_mapping.append(data_field_map[table_field.name])
+            if table_field.id in data_field_map:
+                index_mapping.append(data_field_map[table_field.id])
             else:
                 index_mapping.append(-1)  # Field not found in data schema
 
diff --git a/paimon-python/pypaimon/tests/manifest/simple_stats_evolutions_test.py b/paimon-python/pypaimon/tests/manifest/simple_stats_evolutions_test.py
new file mode 100644
index 0000000000..cfc811eaab
--- /dev/null
+++ b/paimon-python/pypaimon/tests/manifest/simple_stats_evolutions_test.py
@@ -0,0 +1,156 @@
+################################################################################
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import unittest
+
+from pypaimon.manifest.simple_stats_evolutions import SimpleStatsEvolutions
+from pypaimon.schema.data_types import DataField, AtomicType
+from pypaimon.manifest.schema.simple_stats import SimpleStats
+from pypaimon.table.row.generic_row import GenericRow
+
+
+class SimpleStatsEvolutionsTest(unittest.TestCase):
+
+    def _make_fields(self, field_defs):
+        """Helper: field_defs is list of (id, name, type_str)."""
+        return [DataField(fid, fname, AtomicType(ftype)) for fid, fname, ftype in field_defs]
+
+    def _make_stats(self, fields, min_vals, max_vals, null_counts):
+        """Helper: build SimpleStats from parallel lists."""
+        df = [DataField(fid, fname, AtomicType(ftype)) for fid, fname, ftype in fields]
+        return SimpleStats(GenericRow(min_vals, df), GenericRow(max_vals, df), null_counts)
+
+    def test_same_schema_no_mapping(self):
+        """When data schema == table schema, index_mapping should be None."""
+        fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'STRING')])
+        schemas = {0: fields}
+        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 0)
+        evolution = evolutions.get_or_create(0)
+        self.assertIsNone(evolution.index_mapping)
+
+    def test_added_column(self):
+        """New column: mapping is -1, null_count = row_count."""
+        data_fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'INT')])
+        table_fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'INT'), (2, 'c', 'INT')])
+        schemas = {0: data_fields, 1: table_fields}
+        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
+        evolution = evolutions.get_or_create(0)
+
+        self.assertEqual(evolution.index_mapping, [0, 1, -1])
+
+        stats = self._make_stats([(0, 'a', 'INT'), (1, 'b', 'INT')],
+                                 [1, 2], [10, 20], [0, 5])
+        evolved = evolution.evolution(stats, row_count=500, stats_fields=None)
+        self.assertIsNone(evolved.min_values.get_field(2))
+        self.assertIsNone(evolved.max_values.get_field(2))
+        self.assertEqual(evolved.null_counts, [0, 5, 500])
+
+    def test_dropped_column(self):
+        """Dropped column is excluded from mapping."""
+        data_fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'STRING'), (2, 'c', 'BIGINT')])
+        table_fields = self._make_fields([(0, 'a', 'INT'), (2, 'c', 'BIGINT')])
+        schemas = {0: data_fields, 1: table_fields}
+        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
+        evolution = evolutions.get_or_create(0)
+        # a->0, c->2 (b dropped)
+        self.assertEqual(evolution.index_mapping, [0, 2])
+
+    def test_renamed_column(self):
+        """Column rename: matched by field ID, not name."""
+        data_fields = self._make_fields([(0, 'old_name', 'INT'), (1, 'b', 'INT')])
+        table_fields = self._make_fields([(0, 'new_name', 'INT'), (1, 'b', 'INT')])
+        schemas = {0: data_fields, 1: table_fields}
+        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
+        evolution = evolutions.get_or_create(0)
+
+        self.assertEqual(evolution.index_mapping, [0, 1])
+        self.assertNotEqual(evolution.index_mapping[0], -1)  # not missing
+
+        stats = self._make_stats([(0, 'old_name', 'INT'), (1, 'b', 'INT')],
+                                 [10, 20], [100, 200], [0, 5])
+        evolved = evolution.evolution(stats, row_count=1000, stats_fields=None)
+        self.assertEqual(evolved.min_values.get_field(0), 10)
+        self.assertEqual(evolved.min_values.get_field(1), 20)
+        self.assertEqual(evolved.max_values.get_field(0), 100)
+        self.assertEqual(evolved.max_values.get_field(1), 200)
+        self.assertEqual(evolved.null_counts, [0, 5])
+
+    def test_reordered_columns(self):
+        """Column order differs between schemas, matched by field ID."""
+        data_fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'STRING'), (2, 'c', 'BIGINT')])
+        table_fields = self._make_fields([(2, 'c', 'BIGINT'), (0, 'a', 'INT'), (1, 'b', 'STRING')])
+        schemas = {0: data_fields, 1: table_fields}
+        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
+        evolution = evolutions.get_or_create(0)
+        self.assertEqual(evolution.index_mapping, [2, 0, 1])
+
+    def test_caching(self):
+        """Evolution objects are cached by schema ID."""
+        fields = self._make_fields([(0, 'a', 'INT')])
+        schemas = {0: fields, 1: fields}
+        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
+        e1 = evolutions.get_or_create(0)
+        e2 = evolutions.get_or_create(0)
+        self.assertIs(e1, e2)
+
+    def test_complex_evolution(self):
+        """Reorder + add + drop + rename combined.
+
+        Data schema (id=0): a(0), b(1), c(2), d(3)
+        Table schema (id=1): cc(2), a(0), d(3), e(4, new), bb(1)
+
+        Verifies index_mapping, min/max projection, and null count evolution.
+        """
+        data_fields = self._make_fields([
+            (0, 'a', 'INT'), (1, 'b', 'INT'), (2, 'c', 'INT'), (3, 'd', 'INT')
+        ])
+        table_fields = self._make_fields([
+            (2, 'cc', 'INT'), (0, 'a', 'INT'), (3, 'd', 'INT'), (4, 'e', 'INT'), (1, 'bb', 'INT')
+        ])
+        schemas = {0: data_fields, 1: table_fields}
+        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
+        evolution = evolutions.get_or_create(0)
+
+        # Mapping: cc(id=2)->2, a(id=0)->0, d(id=3)->3, e(id=4)->-1, bb(id=1)->1
+        self.assertEqual(evolution.index_mapping, [2, 0, 3, -1, 1])
+
+        stats = self._make_stats(
+            [(0, 'a', 'INT'), (1, 'b', 'INT'), (2, 'c', 'INT'), (3, 'd', 'INT')],
+            [10, 20, 30, 40], [100, 200, 300, 400], [1, 2, 3, 4])
+        evolved = evolution.evolution(stats, row_count=1000, stats_fields=None)
+
+        # min: cc->30, a->10, d->40, e->None, bb->20
+        self.assertEqual(evolved.min_values.get_field(0), 30)
+        self.assertEqual(evolved.min_values.get_field(1), 10)
+        self.assertEqual(evolved.min_values.get_field(2), 40)
+        self.assertIsNone(evolved.min_values.get_field(3))
+        self.assertEqual(evolved.min_values.get_field(4), 20)
+
+        # max: cc->300, a->100, d->400, e->None, bb->200
+        self.assertEqual(evolved.max_values.get_field(0), 300)
+        self.assertEqual(evolved.max_values.get_field(1), 100)
+        self.assertEqual(evolved.max_values.get_field(2), 400)
+        self.assertIsNone(evolved.max_values.get_field(3))
+        self.assertEqual(evolved.max_values.get_field(4), 200)
+
+        # null counts: cc->3, a->1, d->4, e->1000(row_count), bb->2
+        self.assertEqual(evolved.null_counts, [3, 1, 4, 1000, 2])
+
+
+if __name__ == '__main__':
+    unittest.main()

(paimon) branch master updated: [python] Fix stats evolution index mapping to use field ID instead of field name (#7593)

Reply via email to