This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch release-1.3
in repository https://gitbox.apache.org/repos/asf/paimon.git

commit 71128279c9236c5b599508c0b862929e31dd419c
Author: Jingsong Lee <[email protected]>
AuthorDate: Mon Oct 20 21:08:02 2025 +0800

    [python] Drop stats for manifest entries reading (#6429)
---
 .../pypaimon/manifest/manifest_file_manager.py     |  4 +++-
 .../pypaimon/manifest/schema/data_file_meta.py     | 26 ++++++++++++++++++++++
 .../pypaimon/manifest/schema/manifest_entry.py     | 10 +++++++++
 .../pypaimon/manifest/schema/simple_stats.py       | 11 +++++++++
 paimon-python/pypaimon/tests/predicates_test.py    | 12 +++++-----
 .../pypaimon/tests/py36/rest_ao_read_write_test.py |  6 +++--
 paimon-python/pypaimon/tests/reader_base_test.py   | 14 +++++++++---
 7 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/paimon-python/pypaimon/manifest/manifest_file_manager.py b/paimon-python/pypaimon/manifest/manifest_file_manager.py
index 07e434dd37..bb9251df7e 100644
--- a/paimon-python/pypaimon/manifest/manifest_file_manager.py
+++ b/paimon-python/pypaimon/manifest/manifest_file_manager.py
@@ -41,7 +41,7 @@ class ManifestFileManager:
         self.primary_key_fields = self.table.table_schema.get_primary_key_fields()
         self.trimmed_primary_key_fields = self.table.table_schema.get_trimmed_primary_key_fields()
 
-    def read(self, manifest_file_name: str, manifest_entry_filter=None) -> List[ManifestEntry]:
+    def read(self, manifest_file_name: str, manifest_entry_filter=None, drop_stats=True) -> List[ManifestEntry]:
         manifest_file_path = self.manifest_path / manifest_file_name
 
         entries = []
@@ -107,6 +107,8 @@ class ManifestFileManager:
             )
             if manifest_entry_filter is not None and not manifest_entry_filter(entry):
                 continue
+            if drop_stats:
+                entry = entry.copy_without_stats()
             entries.append(entry)
         return entries
 
diff --git a/paimon-python/pypaimon/manifest/schema/data_file_meta.py b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
index 1d1bcb56fb..8206061c84 100644
--- a/paimon-python/pypaimon/manifest/schema/data_file_meta.py
+++ b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
@@ -61,6 +61,32 @@ class DataFileMeta:
         path_builder = path_builder / ("bucket-" + str(bucket)) / self.file_name
         self.file_path = str(path_builder)
 
+    def copy_without_stats(self) -> 'DataFileMeta':
+        """Create a new DataFileMeta without value statistics."""
+        return DataFileMeta(
+            file_name=self.file_name,
+            file_size=self.file_size,
+            row_count=self.row_count,
+            min_key=self.min_key,
+            max_key=self.max_key,
+            key_stats=self.key_stats,
+            value_stats=SimpleStats.empty_stats(),
+            min_sequence_number=self.min_sequence_number,
+            max_sequence_number=self.max_sequence_number,
+            schema_id=self.schema_id,
+            level=self.level,
+            extra_files=self.extra_files,
+            creation_time=self.creation_time,
+            delete_row_count=self.delete_row_count,
+            embedded_index=self.embedded_index,
+            file_source=self.file_source,
+            value_stats_cols=[],
+            external_path=self.external_path,
+            first_row_id=self.first_row_id,
+            write_cols=self.write_cols,
+            file_path=self.file_path
+        )
+
     def assign_first_row_id(self, first_row_id: int) -> 'DataFileMeta':
         """Create a new DataFileMeta with the assigned first_row_id."""
         return DataFileMeta(
diff --git a/paimon-python/pypaimon/manifest/schema/manifest_entry.py b/paimon-python/pypaimon/manifest/schema/manifest_entry.py
index 9608fbbd37..b1fd244daf 100644
--- a/paimon-python/pypaimon/manifest/schema/manifest_entry.py
+++ b/paimon-python/pypaimon/manifest/schema/manifest_entry.py
@@ -31,6 +31,16 @@ class ManifestEntry:
     total_buckets: int
     file: DataFileMeta
 
+    def copy_without_stats(self) -> 'ManifestEntry':
+        """Create a new ManifestEntry without value statistics."""
+        return ManifestEntry(
+            kind=self.kind,
+            partition=self.partition,
+            bucket=self.bucket,
+            total_buckets=self.total_buckets,
+            file=self.file.copy_without_stats()
+        )
+
     def assign_first_row_id(self, first_row_id: int) -> 'ManifestEntry':
         """Create a new ManifestEntry with the assigned first_row_id."""
         return ManifestEntry(
diff --git a/paimon-python/pypaimon/manifest/schema/simple_stats.py b/paimon-python/pypaimon/manifest/schema/simple_stats.py
index 45982491b9..19816fdd0f 100644
--- a/paimon-python/pypaimon/manifest/schema/simple_stats.py
+++ b/paimon-python/pypaimon/manifest/schema/simple_stats.py
@@ -18,6 +18,7 @@
 
 from dataclasses import dataclass
 from typing import List, Optional
+from typing import ClassVar
 
 from pypaimon.table.row.generic_row import GenericRow
 
@@ -28,6 +29,16 @@ class SimpleStats:
     max_values: GenericRow
     null_counts: Optional[List[int]]
 
+    _empty_stats: ClassVar[object] = None
+
+    @classmethod
+    def empty_stats(cls):
+        if cls._empty_stats is None:
+            min_values = GenericRow([], [])
+            max_values = GenericRow([], [])
+            cls._empty_stats = cls(min_values, max_values, None)
+        return cls._empty_stats
+
 
 SIMPLE_STATS_SCHEMA = {
     "type": "record",
diff --git a/paimon-python/pypaimon/tests/predicates_test.py b/paimon-python/pypaimon/tests/predicates_test.py
index a3a0e3229c..6158d1d88b 100644
--- a/paimon-python/pypaimon/tests/predicates_test.py
+++ b/paimon-python/pypaimon/tests/predicates_test.py
@@ -454,20 +454,20 @@ class PredicateTest(unittest.TestCase):
             if split.partition.values == ["p1", 2]:
                 count += 1
                 self.assertEqual(len(split.files), 1)
-                min_values = split.files[0].value_stats.min_values.to_dict()
-                max_values = split.files[0].value_stats.max_values.to_dict()
+                min_values = split.files[0].key_stats.min_values.to_dict()
+                max_values = split.files[0].key_stats.max_values.to_dict()
                 self.assertTrue(min_values["key1"] == 1 and min_values["key2"] == "e"
                                 and max_values["key1"] == 4 and max_values["key2"] == "h")
             elif split.partition.values == ["p2", 2]:
                 count += 1
-                min_values = split.files[0].value_stats.min_values.to_dict()
-                max_values = split.files[0].value_stats.max_values.to_dict()
+                min_values = split.files[0].key_stats.min_values.to_dict()
+                max_values = split.files[0].key_stats.max_values.to_dict()
                 self.assertTrue(min_values["key1"] == 5 and min_values["key2"] == "a"
                                 and max_values["key1"] == 8 and max_values["key2"] == "d")
             elif split.partition.values == ["p1", 1]:
                 count += 1
-                min_values = split.files[0].value_stats.min_values.to_dict()
-                max_values = split.files[0].value_stats.max_values.to_dict()
+                min_values = split.files[0].key_stats.min_values.to_dict()
+                max_values = split.files[0].key_stats.max_values.to_dict()
                 self.assertTrue(min_values["key1"] == max_values["key1"] == 7
                                 and max_values["key2"] == max_values["key2"] == "b")
         self.assertEqual(count, 3)
diff --git a/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py b/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py
index a184f64866..9be66d9759 100644
--- a/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py
+++ b/paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py
@@ -180,7 +180,9 @@ class RESTAOReadWritePy36Test(RESTBaseTest):
         latest_snapshot = SnapshotManager(table).get_latest_snapshot()
         manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot)
         manifest_entries = table_scan.starting_scanner.manifest_file_manager.read(
-            manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row))
+            manifest_files[0].file_name,
+            lambda row: table_scan.starting_scanner._filter_manifest_entry(row),
+            drop_stats=False)
         min_value_stats = manifest_entries[0].file.value_stats.min_values.values
         max_value_stats = manifest_entries[0].file.value_stats.max_values.values
         expected_min_values = [col[0].as_py() for col in expect_data]
@@ -849,7 +851,7 @@ class RESTAOReadWritePy36Test(RESTBaseTest):
         manifest_manager.write(manifest_file_name, [entry])
 
         # Read the manifest entry back
-        entries = manifest_manager.read(manifest_file_name)
+        entries = manifest_manager.read(manifest_file_name, drop_stats=False)
 
         # Verify we have exactly one entry
         self.assertEqual(len(entries), 1)
diff --git a/paimon-python/pypaimon/tests/reader_base_test.py b/paimon-python/pypaimon/tests/reader_base_test.py
index d158c824ef..a06e120e95 100644
--- a/paimon-python/pypaimon/tests/reader_base_test.py
+++ b/paimon-python/pypaimon/tests/reader_base_test.py
@@ -210,14 +210,22 @@ class ReaderBasicTest(unittest.TestCase):
         read_builder = table.new_read_builder()
         table_scan = read_builder.new_scan()
         table_read = read_builder.new_read()
-        actual_data = table_read.to_arrow(table_scan.plan().splits())
+        splits = table_scan.plan().splits()
+
+        # assert data file without stats
+        first_file = splits[0].files[0]
+        self.assertEqual(first_file.value_stats_cols, [])
+        self.assertEqual(first_file.value_stats, SimpleStats.empty_stats())
+
+        # assert equal
+        actual_data = table_read.to_arrow(splits)
         self.assertEqual(actual_data, expect_data)
 
         # to test GenericRow ability
         latest_snapshot = SnapshotManager(table).get_latest_snapshot()
         manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot)
         manifest_entries = table_scan.starting_scanner.manifest_file_manager.read(
-            manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row))
+            manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row), False)
         min_value_stats = manifest_entries[0].file.value_stats.min_values.values
         max_value_stats = manifest_entries[0].file.value_stats.max_values.values
         expected_min_values = [col[0].as_py() for col in expect_data]
@@ -627,7 +635,7 @@ class ReaderBasicTest(unittest.TestCase):
         manifest_manager.write(manifest_file_name, [entry])
 
         # Read the manifest entry back
-        entries = manifest_manager.read(manifest_file_name)
+        entries = manifest_manager.read(manifest_file_name, drop_stats=False)
 
         # Verify we have exactly one entry
         self.assertEqual(len(entries), 1)

Reply via email to