This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new b995d537df [python] Fix with_projection silently dropping top-level
fields whose names contain a dot (#7955)
b995d537df is described below
commit b995d537dfd3e66d395ea58c515b47b29a70c895
Author: XiaoHongbo <[email protected]>
AuthorDate: Mon May 25 17:49:11 2026 +0800
[python] Fix with_projection silently dropping top-level fields whose names
contain a dot (#7955)
---
paimon-python/pypaimon/read/read_builder.py | 13 +++--
.../pypaimon/tests/test_nested_projection_e2e.py | 66 ++++++++++++++++++++++
2 files changed, 74 insertions(+), 5 deletions(-)
diff --git a/paimon-python/pypaimon/read/read_builder.py b/paimon-python/pypaimon/read/read_builder.py
index 2ee9a39040..5537c97dd6 100644
--- a/paimon-python/pypaimon/read/read_builder.py
+++ b/paimon-python/pypaimon/read/read_builder.py
@@ -58,6 +58,9 @@ class ReadBuilder:
only callers see the same observable behaviour as before — the
dotted form is opt-in. Unknown names are silently skipped to
preserve the pre-existing contract.
+
+ Precedence: if a dotted name matches an actual top-level field, the
+ top-level match wins and the name is not walked as a struct path.
"""
self._projection = projection
if projection and any('.' in name for name in projection):
@@ -164,13 +167,13 @@ class ReadBuilder:
paths: List[List[int]] = []
for name in names:
- if '.' not in name:
- if name not in top_index:
- # Silently skip unknown names — preserves the
- # pre-existing contract from the plain top-level path.
- continue
+ # Dot can be part of a top-level field name, not only a struct path
+ # separator. Top-level match takes precedence over struct walk.
+ if name in top_index:
paths.append([top_index[name]])
continue
+ if '.' not in name:
+ continue
parts = name.split('.')
top = parts[0]
if top not in top_index:
diff --git a/paimon-python/pypaimon/tests/test_nested_projection_e2e.py b/paimon-python/pypaimon/tests/test_nested_projection_e2e.py
index 9cc37c0a83..50a239a1d8 100644
--- a/paimon-python/pypaimon/tests/test_nested_projection_e2e.py
+++ b/paimon-python/pypaimon/tests/test_nested_projection_e2e.py
@@ -95,6 +95,50 @@ class AppendOnlyNestedParquetTest(_AppendOnlyNestedBase):
{'mv_latest_version': 200, 'val': 'y', 'mv_latest_value': 'b'},
{'mv_latest_version': 300, 'val': 'z', 'mv_latest_value': 'c'}])
+ def test_dotted_top_level_field_kept(self):
+ pa_schema = pa.schema([
+ ('id', pa.int64()),
+ ('media.left', pa.string()),
+ ])
+ identifier = 'default.ao_dotted_top_level'
+ self.catalog.create_table(
+ identifier,
+ Schema.from_pyarrow_schema(pa_schema, options={'bucket': '-1'}),
+ False)
+ table = self.catalog.get_table(identifier)
+ wb = table.new_batch_write_builder()
+ w = wb.new_write()
+ w.write_arrow(pa.Table.from_pylist(
+ [{'id': 1, 'media.left': 'hello'}], schema=pa_schema))
+ wb.new_commit().commit(w.prepare_commit())
+ w.close()
+
+ rb = table.new_read_builder().with_projection(['id', 'media.left'])
+ got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
+ self.assertEqual(got, [{'id': 1, 'media.left': 'hello'}])
+
+ def test_unknown_dotted_name_silently_skipped(self):
+ pa_schema = pa.schema([
+ ('id', pa.int64()),
+ ('plain', pa.string()),
+ ])
+ identifier = 'default.ao_unknown_dotted'
+ self.catalog.create_table(
+ identifier,
+ Schema.from_pyarrow_schema(pa_schema, options={'bucket': '-1'}),
+ False)
+ table = self.catalog.get_table(identifier)
+ wb = table.new_batch_write_builder()
+ w = wb.new_write()
+ w.write_arrow(pa.Table.from_pylist(
+ [{'id': 1, 'plain': 'x'}], schema=pa_schema))
+ wb.new_commit().commit(w.prepare_commit())
+ w.close()
+
+ rb = table.new_read_builder().with_projection(['id', 'nonexistent.foo'])
+ got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
+ self.assertEqual(got, [{'id': 1}])
+
def test_top_level_only_projection_unchanged(self):
"""A projection without dots must keep the existing top-level
path — file-level pushdown still asks for plain column names,
@@ -235,6 +279,28 @@ class PrimaryKeyNestedTest(_AppendOnlyNestedBase):
arrow.column('val').to_pylist()))
self.assertEqual(rows, [(1, 100, 'x'), (2, 200, 'y'), (3, 300, 'z')])
+ def test_dotted_top_level_field_kept(self):
+ pa_schema = pa.schema([
+ ('id', pa.int64()),
+ ('media.left', pa.string()),
+ ])
+ identifier = 'default.pk_dotted_top_level'
+ schema = Schema.from_pyarrow_schema(
+ pa_schema, primary_keys=['id'], options={'bucket': '1'})
+ self.catalog.create_table(identifier, schema, False)
+ table = self.catalog.get_table(identifier)
+ rows = [{'id': 1, 'media.left': 'hello'}]
+ for _ in range(2):
+ wb = table.new_batch_write_builder()
+ w = wb.new_write()
+ w.write_arrow(pa.Table.from_pylist(rows, schema=pa_schema))
+ wb.new_commit().commit(w.prepare_commit())
+ w.close()
+
+ rb = table.new_read_builder().with_projection(['id', 'media.left'])
+ got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
+ self.assertEqual(got, [{'id': 1, 'media.left': 'hello'}])
+
def test_avro_extracts_single_nested_leaf(self):
# Avro PK reads resolve DataFields through ``full_fields_map`` which
# historically only covered merge-internal aliases; without the