This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new b995d537df [python] Fix with_projection silently dropping top-level fields whose names contain a dot (#7955)
b995d537df is described below

commit b995d537dfd3e66d395ea58c515b47b29a70c895
Author: XiaoHongbo <[email protected]>
AuthorDate: Mon May 25 17:49:11 2026 +0800

    [python] Fix with_projection silently dropping top-level fields whose names contain a dot (#7955)
---
 paimon-python/pypaimon/read/read_builder.py        | 13 +++--
 .../pypaimon/tests/test_nested_projection_e2e.py   | 66 ++++++++++++++++++++++
 2 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/paimon-python/pypaimon/read/read_builder.py b/paimon-python/pypaimon/read/read_builder.py
index 2ee9a39040..5537c97dd6 100644
--- a/paimon-python/pypaimon/read/read_builder.py
+++ b/paimon-python/pypaimon/read/read_builder.py
@@ -58,6 +58,9 @@ class ReadBuilder:
         only callers see the same observable behaviour as before — the
         dotted form is opt-in. Unknown names are silently skipped to
         preserve the pre-existing contract.
+
+        Precedence: if a dotted name matches an actual top-level field, the
+        top-level match wins and the name is not walked as a struct path.
         """
         self._projection = projection
         if projection and any('.' in name for name in projection):
@@ -164,13 +167,13 @@ class ReadBuilder:
 
         paths: List[List[int]] = []
         for name in names:
-            if '.' not in name:
-                if name not in top_index:
-                    # Silently skip unknown names — preserves the
-                    # pre-existing contract from the plain top-level path.
-                    continue
+            # Dot can be part of a top-level field name, not only a struct path
+            # separator. Top-level match takes precedence over struct walk.
+            if name in top_index:
                 paths.append([top_index[name]])
                 continue
+            if '.' not in name:
+                continue
             parts = name.split('.')
             top = parts[0]
             if top not in top_index:
diff --git a/paimon-python/pypaimon/tests/test_nested_projection_e2e.py b/paimon-python/pypaimon/tests/test_nested_projection_e2e.py
index 9cc37c0a83..50a239a1d8 100644
--- a/paimon-python/pypaimon/tests/test_nested_projection_e2e.py
+++ b/paimon-python/pypaimon/tests/test_nested_projection_e2e.py
@@ -95,6 +95,50 @@ class AppendOnlyNestedParquetTest(_AppendOnlyNestedBase):
              {'mv_latest_version': 200, 'val': 'y', 'mv_latest_value': 'b'},
              {'mv_latest_version': 300, 'val': 'z', 'mv_latest_value': 'c'}])
 
+    def test_dotted_top_level_field_kept(self):
+        pa_schema = pa.schema([
+            ('id', pa.int64()),
+            ('media.left', pa.string()),
+        ])
+        identifier = 'default.ao_dotted_top_level'
+        self.catalog.create_table(
+            identifier,
+            Schema.from_pyarrow_schema(pa_schema, options={'bucket': '-1'}),
+            False)
+        table = self.catalog.get_table(identifier)
+        wb = table.new_batch_write_builder()
+        w = wb.new_write()
+        w.write_arrow(pa.Table.from_pylist(
+            [{'id': 1, 'media.left': 'hello'}], schema=pa_schema))
+        wb.new_commit().commit(w.prepare_commit())
+        w.close()
+
+        rb = table.new_read_builder().with_projection(['id', 'media.left'])
+        got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
+        self.assertEqual(got, [{'id': 1, 'media.left': 'hello'}])
+
+    def test_unknown_dotted_name_silently_skipped(self):
+        pa_schema = pa.schema([
+            ('id', pa.int64()),
+            ('plain', pa.string()),
+        ])
+        identifier = 'default.ao_unknown_dotted'
+        self.catalog.create_table(
+            identifier,
+            Schema.from_pyarrow_schema(pa_schema, options={'bucket': '-1'}),
+            False)
+        table = self.catalog.get_table(identifier)
+        wb = table.new_batch_write_builder()
+        w = wb.new_write()
+        w.write_arrow(pa.Table.from_pylist(
+            [{'id': 1, 'plain': 'x'}], schema=pa_schema))
+        wb.new_commit().commit(w.prepare_commit())
+        w.close()
+
+        rb = table.new_read_builder().with_projection(['id', 'nonexistent.foo'])
+        got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
+        self.assertEqual(got, [{'id': 1}])
+
     def test_top_level_only_projection_unchanged(self):
         """A projection without dots must keep the existing top-level
         path — file-level pushdown still asks for plain column names,
@@ -235,6 +279,28 @@ class PrimaryKeyNestedTest(_AppendOnlyNestedBase):
             arrow.column('val').to_pylist()))
         self.assertEqual(rows, [(1, 100, 'x'), (2, 200, 'y'), (3, 300, 'z')])
 
+    def test_dotted_top_level_field_kept(self):
+        pa_schema = pa.schema([
+            ('id', pa.int64()),
+            ('media.left', pa.string()),
+        ])
+        identifier = 'default.pk_dotted_top_level'
+        schema = Schema.from_pyarrow_schema(
+            pa_schema, primary_keys=['id'], options={'bucket': '1'})
+        self.catalog.create_table(identifier, schema, False)
+        table = self.catalog.get_table(identifier)
+        rows = [{'id': 1, 'media.left': 'hello'}]
+        for _ in range(2):
+            wb = table.new_batch_write_builder()
+            w = wb.new_write()
+            w.write_arrow(pa.Table.from_pylist(rows, schema=pa_schema))
+            wb.new_commit().commit(w.prepare_commit())
+            w.close()
+
+        rb = table.new_read_builder().with_projection(['id', 'media.left'])
+        got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
+        self.assertEqual(got, [{'id': 1, 'media.left': 'hello'}])
+
     def test_avro_extracts_single_nested_leaf(self):
         # Avro PK reads resolve DataFields through ``full_fields_map`` which
         # historically only covered merge-internal aliases; without the

Reply via email to